View source for Action: Binary Search

#||
||
Compatible with: !!(green)**R6.1**!!
Current version: 0.1 !!Draft!!
Credits: ((user:WikiAdmin WikiAdmin))

|{{toc numerate=1}} ||
||#

Binary search allows users to do a global string search on all the content pages of a wiki using **exact** or **regex** search.

  1. **exact** - binary search, include only results that match the exact search term (##BINARY LIKE##)
  1. **regexp** - regex search, interpret the search term as a regular expression (##BINARY REGEXP##)
  1. **union** - union search, include results that match any of the whitespace-separated keywords (##AGAINST IN BOOLEAN MODE## & ##LIKE lower##)

##~{{search_binary}}##

**Work in progress**

This moves the binary search functionality of ((/Doc/English/Actions/AdminReplace admin_replace)) into a separate action so that it is available to all users.

file:binary_search_example.png

TODO
  * merge into search action (?)
    * preview and context are different
  * --add message sets for-- DONE
    * %%
'SearchInPages'				=> 'Search in page contents',
'SearchInComments'			=> 'Search in comments',
'SearchInPageTitles'		=> 'Search in page titles',
%% 

action/search_binary.php
%%(php)
<?php

// stop right away unless the IN_WACKO constant has been defined,
// i.e. refuse to run when this file is requested directly
if (!defined('IN_WACKO'))
{
	exit;
}

/*
 TODO:
 - put target with regex data in session
 - improve search, possibly merge with search action
 - allow binary search only for registered users (?)
 */

// help text of the action; it is handed to $this->help() further down
// when the 'help' parameter is set
$info = <<<EOD
Description:
	binary search allows users to do a global string search on all the content pages of a wiki.

Usage:
	{{search_binary}}

Options:
	[page="PageName"]
	[options=1]
	[lang="en"]
	[max=Number]
EOD;

// functions
/**
 * Find pages and/or comments whose body or title contains the target and
 * return one page of results together with the pagination data.
 *
 * @param string	$target		search term (literal text, or a regular expression when $use_regex is set)
 * @param string	$tag		namespace to search in; matches the tag itself and everything below "tag/"
 * @param bool		$use_regex	true: BINARY REGEXP match, false: BINARY LIKE (substring) match
 * @param int		$limit		results per page, handed to $this->pagination()
 * @param array		$filter		optional keys: category_ids, lang, comments, pages, titles
 * @param bool		$deleted	true: do not filter out rows flagged as deleted
 *
 * @return array	[result rows, pagination array, total number of matching rows]
 */
$search_text = function ($target, $tag, $use_regex, $limit, $filter = [], $deleted = false)
{
	// read only the known filter keys; unlike extract() this can never
	// overwrite $target, $tag or another local variable
	$category_ids	= $filter['category_ids']	?? null;
	$lang			= $filter['lang']			?? null;
	$comments		= $filter['comments']		?? null;
	$pages			= $filter['pages']			?? null;
	$titles			= $filter['titles']			?? null;
	$prefix			= $this->prefix;

	if ($use_regex)
	{
		$operator	= 'REGEXP';
		$needle		= $this->db->q($target);
	}
	else
	{
		// an "exact" search must treat % and _ (and the escape character \)
		// literally, otherwise they act as LIKE wildcards
		// NOTE(review): assumes the default LIKE escape character '\' - confirm for the target DBMS
		$operator	= 'LIKE';
		$needle		= $this->db->q('%' . addcslashes($target, '%_\\') . '%');
	}

	$in_body	= 'BINARY a.body '	. $operator . ' ' . $needle;
	$in_title	= 'BINARY a.title '	. $operator . ' ' . $needle;

	// choose match scope
	if (($pages || $comments) && $titles)
	{
		$match = '(' . $in_body . ' OR ' . $in_title . ') ';
	}
	else if ($pages || $comments)
	{
		$match = '(' . $in_body . ') ';
	}
	else if ($titles)
	{
		$match = '(' . $in_title . ') ';
	}
	else
	{
		// no scope selected: match nothing instead of building "WHERE AND ..."
		$match = '(1 = 0) ';
	}

	// namespace: include tag and tag/%, not tag%
	$selector =
		($category_ids
			? 'LEFT JOIN ' . $prefix . 'category_assignment ca ON (a.page_id = ca.object_id) '
			: '') .
		($tag
			? 'LEFT JOIN ' . $prefix . 'page b ON (a.comment_on_id = b.page_id) '
			: '') .
		'WHERE ' .
			$match .
			($tag
				? 'AND (a.tag = '	. $this->db->q($tag) . ' ' .
				  'OR a.tag LIKE '	. $this->db->q($tag . '/%') . ' ' .
				  'OR b.tag = '		. $this->db->q($tag) . ' ' .
				  'OR b.tag LIKE '	. $this->db->q($tag . '/%') . ') '
				: '') .
			// comments only / pages only / both (no restriction)
			($comments
				? ($pages
					? ''
					: 'AND a.comment_on_id <> 0 ')
				: 'AND a.comment_on_id = 0 ') .
			(!empty($this->sess->replace_unset)
				? 'AND a.page_id NOT IN (' . $this->ids_string($this->sess->replace_unset) . ') '
				: '') .
			($lang
				? 'AND a.page_lang = ' . $this->db->q($lang) . ' '
				: '') .
			($category_ids
				? 'AND ca.category_id IN (' . $this->ids_string($category_ids) . ') ' .
				  'AND ca.object_type_id = ' . (int) OBJECT_PAGE . ' '
				: '') .
			($deleted
				? ''
				: ($tag
					? 'AND (a.deleted <> 1 OR b.deleted <> 1) '
					: 'AND a.deleted <> 1 ')) .
		' ';

	$count = $this->db->load_single(
		'SELECT COUNT(a.page_id) AS n ' .
		'FROM ' . $prefix . 'page a ' .
		$selector, true);

	// parameters carried by the pagination links; the names must be the ones
	// the action reads back from the request ('categories' is a comma list)
	$params = ['target' => $target]
		+ (!empty($lang)			? ['lang'		=> $lang]							: [])
		+ (!empty($category_ids)	? ['categories'	=> implode(',', $category_ids)]	: [])
		+ (!empty($comments)		? ['comments'	=> $comments]						: [])
		+ (!empty($titles)			? ['titles'		=> $titles]							: [])
		+ (!empty($pages)			? ['pages'		=> $pages]							: [])
		+ (!empty($use_regex)		? ['use_regex'	=> $use_regex]						: []);

	$pagination = $this->pagination($count['n'], $limit, 'p', $params);

	// load search results
	$results = $this->db->load_all(
		'SELECT a.page_id, a.owner_id, a.user_id, a.tag, a.title, a.created, a.modified, a.body, a.comment_on_id, a.page_lang, a.page_size, a.comments,
			u.user_name, o.user_name as owner_name ' .
		'FROM ' . $prefix . 'page a ' .
			'LEFT JOIN ' . $prefix . 'user u ON (a.user_id = u.user_id) ' .
			'LEFT JOIN ' . $prefix . 'user o ON (a.owner_id = o.user_id) ' .
		$selector .
		'ORDER BY a.tag ' .
		$pagination['limit']);

	$page_ids = [];

	// cache the loaded rows and remember their ids for the preloads below
	foreach ($results as $result)
	{
		$this->cache_page($result, true);
		$page_ids[] = $result['page_id'];
		$this->page_id_cache[$result['tag']] = $result['page_id'];
	}

	if ($page_ids)
	{
		$this->preload_acl($page_ids);
		$this->preload_categories($page_ids);
	}

	return [$results, $pagination, $count['n']];
};

// HTML-escape the message with Ut::html(), then swap selected spaces for NBSP:
// a space at the start of a line, a space at the end of a line, and the
// first space of every remaining pair of spaces
$space_to_nbsp = function ($message)
{
	$patterns		= ['/^ /m',		'/ $/m',		'/  /'];
	$replacements	= [NBSP . ' ',	' ' . NBSP,		NBSP . ' '];

	// the pattern/replacement pairs are applied one after another, in order
	return preg_replace($patterns, $replacements, Ut::html($message));
};

/**
 * Strip a truncated multibyte UTF-8 sequence from the end of a string.
 *
 * Handles a dangling lead byte as well as a lead byte followed by too few
 * continuation bytes; a string ending in a complete character is returned
 * unchanged.
 */
$remove_bad_char_last = function ($string)
{
	if ($string == '')
	{
		return $string;
	}

	$last_byte = ord($string[strlen($string) - 1]);

	// a lead byte at the very end: its continuation bytes are missing
	if ($last_byte >= 0xc0)
	{
		return substr($string, 0, -1);
	}

	// plain ASCII at the end: nothing to repair
	if ($last_byte < 0x80)
	{
		return $string;
	}

	// continuation byte at the end: check for a 3- or 4-byte lead that has
	// too few continuation bytes (/s lets the prefix span newlines)
	$parts		= [];
	$incomplete	= preg_match('/^(.*)(?:[\xe0-\xef][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf]{1,2})$/s', $string, $parts);

	return $incomplete ? $parts[1] : $string;
};

/**
 * Strip orphaned UTF-8 continuation bytes from the start of a string,
 * i.e. the tail of a character whose lead byte was cut off.
 */
$remove_bad_char_first = function ($string)
{
	if ($string == '')
	{
		return $string;
	}

	// continuation bytes are 0x80..0xbf, i.e. the two top bits are "10"
	$starts_inside_char = (ord($string[0]) & 0xc0) === 0x80;

	return $starts_inside_char
		? preg_replace('/^[\x80-\xbf]+/', '', $string)
		: $string;
};

/**
 * Shorten a string to at most |$length| bytes and mark the cut with $ellipsis.
 *
 * A positive $length keeps the head ("text..."), a negative one keeps the
 * tail ("...text"). Strings that already fit are returned untouched, and a
 * $length of 0 yields just the ellipsis. A multibyte character split by the
 * cut is dropped, and whitespace next to the ellipsis is trimmed.
 */
$truncate_string = function ($string, $length = 80, $ellipsis = '...') use ($remove_bad_char_first, $remove_bad_char_last)
{
	// short enough already
	if (strlen($string) <= abs($length))
	{
		return $string;
	}

	if ($length == 0)
	{
		return $ellipsis;
	}

	if ($length < 0)
	{
		// ...text
		$tail = $remove_bad_char_first(substr($string, $length));

		return $ellipsis . ltrim($tail);
	}

	// text...
	$head = $remove_bad_char_last(substr($string, 0, $length));

	return rtrim($head) . $ellipsis;
};

/**
 * Build an HTML preview of every place where the target occurs in the text.
 *
 * Each occurrence is shown with up to $padding bytes of context on either
 * side; occurrences that lie close together are merged into one snippet.
 * The matched text is wrapped in <code> tags and line breaks are shown as '↵'.
 *
 * @param string	$text		text to search in
 * @param string	$target		literal search term, or a regular expression when $use_regex is set
 * @param bool		$use_regex	interpret $target as a regular expression
 * @param int		$padding	context length (in bytes) before and after a snippet
 *
 * @return string	escaped preview markup, empty when nothing matched
 */
$extract_context = function ($text, $target, $use_regex = false, $padding = 40) use ($space_to_nbsp, $truncate_string)
{
	// build the pattern used to locate the matches
	if ($use_regex)
	{
		$target_q	= str_replace('/', "\\/", $target);
		$target_str	= "/$target_q/Uu";
	}
	else
	{
		$target_q	= preg_quote($target, '/');
		$target_str	= "/$target_q/u";
	}

	// offsets reported by PREG_OFFSET_CAPTURE are byte offsets, which is what
	// the substr() calls below expect
	preg_match_all($target_str, $text, $matches, PREG_OFFSET_CAPTURE);

	$pos	= $matches[0] ?? [];
	$total	= count($pos);
	$cuts	= [];

	// [0] => matched string, [1] => offset
	for ($i = 0; $i < $total; $i++)
	{
		$index	= $pos[$i][1];
		$len	= strlen($pos[$i][0]);

		// merge with the following matches while their context would overlap
		while (isset($pos[$i + 1][1]) && $pos[$i + 1][1] < $index + $len + $padding * 2)
		{
			$i++;

			// the snippet now ends where the merged match ends; regex matches
			// may differ in length, so use the length of this match and not
			// the length of the first one
			$len = $pos[$i][1] + strlen($pos[$i][0]) - $index;
		}

		$cuts[] = [$index, $len];
	}

	// the snippets are escaped before highlighting, so a literal target has
	// to be escaped the same way to be found again
	if (!$use_regex)
	{
		$target_q	= preg_quote($space_to_nbsp($target), '/');
		$target_str	= "/$target_q/u";
	}

	$context = '';

	foreach ($cuts as [$index, $len])
	{
		$context_before	= $truncate_string(substr($text, 0, $index), -$padding);
		$context_after	= $truncate_string(substr($text, $index + $len), $padding);
		$snippet		= $space_to_nbsp(substr($text, $index, $len));

		$context		.= $space_to_nbsp($context_before);
		$context		.= preg_replace($target_str, '<code>\0</code>', $snippet);
		$context		.= $space_to_nbsp($context_after);
	}

	// visualize line breaks
	return str_replace("\n", '↵', $context);
};

/**
 * Fill the 'search_' section of the template (the search form).
 *
 * Keys read from $o: target, use_regex, page, pages, titles, comments,
 * options, lang and filter['category_ids'].
 *
 * @param array $o option array assembled by the calling script
 */
$search_form = function (array $o) use ($tpl)
{
	$category_ids		= $o['filter']['category_ids'];

	$tpl->enter('search_');

	// re-populate the form with the submitted values
	$tpl->target		= $o['target'];
	$tpl->regex			= $o['use_regex'];
	$tpl->tag			= $o['page'];
	$tpl->pages			= $o['pages'];
	$tpl->titles		= $o['titles'];
	$tpl->comments		= $o['comments'];
	$tpl->cancel		= true;

	// optional filters (categories, language) are shown only on request
	if ($o['options'])
	{
		$tpl->options		= true;

		$tpl->enter('options_');

		// category selection, with the currently filtered categories passed along
		$tpl->c_categories	= $this->show_category_form($this->page_lang, null, OBJECT_PAGE, false, false, $category_ids);

		if ($this->db->multilanguage)
		{
			$languages			= $this->_t('LanguageArray');
			$langs				= $this->http->available_languages();
			// no language filter set: mark the first ("any") option as selected
			$tpl->l_selected	= $o['lang'] ? null : ' selected';

			// one <option> per available language
			foreach ($langs as $iso)
			{
				$tpl->l_o_iso	= $iso;
				$tpl->l_o_lang	= $languages[$iso];

				if ($iso == $o['lang'])
				{
					$tpl->l_o_selected	= ' selected';
				}
			}
		}

		$tpl->leave(); // options_
	}

	$tpl->leave(); // search_
};

/**
 * Fill the 'matches_' section of the template with the search results.
 *
 * @param array	$pages		result rows as returned by $search_text
 * @param array	$pagination	pagination data; 'offset' and 'text' are read here
 * @param int	$tcount		total number of matches (all result pages)
 * @param int	$max		results per page (currently unused, kept for the existing call signature)
 * @param array	$o			option array; reads pages, comments, target, use_regex, padding, title, msg_target
 */
$show_matches = function ($pages, $pagination, $tcount, $max, array $o) use ($tpl, $search_text, $extract_context)
{
	$tpl->enter('matches_');

	$tpl->replace		= true;
	$tpl->cancel		= true;

	if (count($pages) > 5)
	{
		$tpl->invert	= true;
	}

	$tpl->offset			= $pagination['offset'] + 1;
	$tpl->pagination_text	= $pagination['text'];
	$tpl->enter('l_');

	// number of rows actually written to the list
	$shown = 0;

	foreach ($pages as $n => $p)
	{
		// with hide_locked set, rows the user may not read are left out entirely
		if (!$this->db->hide_locked || $this->has_access('read', $p['page_id']))
		{
			$tpl->delim = $n;
			$shown++;

			$preview	= '';

			// generate preview, but never expose the body of an unreadable page
			if (($o['pages'] || $o['comments']) && $this->has_access('read', $p['page_id']))
			{
				$preview	= $extract_context($p['body'], $o['target'], $o['use_regex'], $o['padding']);
			}

			// remember the listed page in the session
			$this->sess->replace_set[] = $p['page_id'];

			$tpl->enter('l_');

			$tpl->pageid	= $p['page_id'];
			$tpl->link		= $this->link('/' . $p['tag'], '', ($o['title'] ? $p['title'] : $p['tag']), '', false);
			$tpl->userlink	= $this->user_link($p['user_name'], false, false);
			$tpl->mtime		= $p['modified'];
			$tpl->psize		= $this->factor_multiples($p['page_size'], 'binary', true, true);

			if ($this->db->multilanguage)
			{
				$tpl->lang	= '- ' . $p['page_lang'];
			}

			$tpl->category	= $this->get_categories($p['page_id'], OBJECT_PAGE);
			$tpl->preview	= $preview;

			if ($p['comments'])
			{
				$tpl->comments_n	= $p['comments'];
			}

			$tpl->leave(); // l_
		}
	}

	$tpl->leave(); // l_

	// result header and closing markup only when at least one row is listed
	if ($shown)
	{
		$tpl->mark_diag		= $this->_t('SearchResults');
		$tpl->mark_phrase	= $o['msg_target'];
		$tpl->mark_count	= $tcount;	// TODO: count only accessible results
		$tpl->emark			= true;
	}

	$tpl->leave(); // matches_
};

// --------------------------------------------------------------------------------

// set defaults for the action parameters
$help			??= 0;
$lang			??= '';
$max			??= 50;	// (10 ... 100)
$options		??= 0;
$padding		??= 40;
$page			??= '/';
$title			??= 1;

if ($help)
{
	$tpl->help	= $this->help($info, 'search_binary');
	return;
}

/*
if (!$this->is_admin())
{
	return;
}
*/

// request parameters: POST wins over GET (GET is used by the pagination links)
$action			= (string)	($_POST['_action']			?? null);
$categories		= (string)	($_POST['categories']		?? ($_GET['categories']		?? ''));
$comments		= (bool)	($_POST['comments']			?? ($_GET['comments']		?? 0));
$pages			= (bool)	($_POST['pages']			?? ($_GET['pages']			?? 0));
$titles			= (bool)	($_POST['titles']			?? ($_GET['titles']			?? 0));
$lang			= (string)	($_POST['lang']				?? ($_GET['lang']			?? $lang));
$page			= (string)	($_POST['page']				?? ($_GET['page']			?? $page));

$target			= (string)	trim(($_POST['target']		?? ($_GET['target']			?? '')));
#$phrase || $phrase = trim(($_GET['phrase'] ?? ''));

$use_regex		= (bool)	($_POST['use_regex']		?? ($_GET['use_regex']		?? 0));

$show_search_form	= true;

// remove \r (body contains only \n)
$target				= str_replace("\r\n", "\n", $target);

// visualize line breaks in message sets
$msg_target			= str_replace("\n", '↵', $target);

if ($lang && !$this->known_language($lang))
{
	$lang = '';
	$this->set_message($this->_t('FilterLangNotAvailable'));
}

// empty page parameter, use root not page context
$page			= $page ?: '/';
$tag			= $this->unwrap_link($page);

// category filter: comma list from the request plus ticked category<id> checkboxes
$category_ids	= [];

if ($categories)
{
	$category_ids = explode(',', $categories);
}

foreach ($_POST as $key => $val)
{
	if (preg_match('/^category(\d+)$/', $key, $ids) && $val == 'set')
	{
		$category_ids[] = $ids[1];
	}
}

// options shared by the form and the result list
$o = [
	'comments'			=> $comments,
	'pages'				=> $pages,
	'titles'			=> $titles,
	'filter' => [
		'category_ids'	=> $category_ids,
		'comments'		=> $comments,
		'lang'			=> $lang,
		'pages'			=> $pages,
		'titles'		=> $titles,
	],
	'lang'				=> $lang,
	'msg_target'		=> $msg_target,
	'options'			=> (bool) $options,
	'padding'			=> (int) $padding,
	'page'				=> $page,
	'tag'				=> $tag,
	'target'			=> $target,
	'title'				=> $title,
	'use_regex'			=> $use_regex,
];

// [B] show search matches
if ($target)
{
	$error = null;

	// at least one search scope is required
	if (!$pages && !$comments && !$titles)
	{
		$error = ['ReplaceTextNoOption', 'hint'];
	}

	if ($error)
	{
		$tpl->message		= $this->show_message($this->_t($error[0]), $error[1], false);
		$show_search_form	= true;
	}
	else if (mb_strlen($target) >= 3)
	{
		// NOTE: a target shorter than 3 characters is ignored without a message

		// search for target matches; the rows get their own variable so the
		// $pages scope flag keeps its meaning for the message selection below
		[$results, $pagination, $tcount] = $search_text($target, $tag, $use_regex, $max, $o['filter']);

		if ($results)
		{
			$show_matches($results, $pagination, $tcount, $max, $o);
		}
		else
		{
			// body search (with or without titles) vs. title-only search
			$msg = ($pages || $comments)
				? 'ReplaceTextNoMatch'
				: 'ReplaceTextNoTitleMatch';

			$tpl->message = $this->show_message(
				Ut::perc_replace(
					$this->_t($msg),
					'<code>' . Ut::html($msg_target) . '</code>'),
				'note', false);

			$show_search_form = true;
		}
	}
}
else
{
	$show_search_form = true;
}

// [A] search form
if ($show_search_form)
{
	// drop the page sets remembered in the session before showing the form
	unset(
		$this->sess->replace_set,
		$this->sess->replace_unset);

	$search_form($o);
}

%%

action/template/search_binary.tpl
%%(php)
[ === main === ]
	[ ' help ' ]
	[ ' message ' ]<br>
	[= search _ =
		<form action="[ ' href: ' ]" method="post" name="select_pages">
			[ ' csrf: select_pages ' ]
			<label for="text_target">[ ' _t: SearchFor ' ]</label><br>
			<textarea id="text_target" name="target" class="cols-100" cols="100" rows="5" title="[ ' _t: ReplaceTextGiveTarget ' ]" required>[ ' target | pre ' ]</textarea><br>
			<input type="checkbox" id="use_regex" name="use_regex"[ ' regex | format ' checked' ' ]>
			<label for="use_regex">[ ' _t: ReplaceTextRegex ' ]</label><br><br>
			<label for="cluster" title="[ ' _t: ReplaceTextCluster ' ]">[ ' _t: Namespace ' ]</label><br>
			<input type="text" id="cluster" name="page" value="[ ' tag | e attr ' ]" size="80" maxlength="255"><br>
			<input type="checkbox" id="pages" name="pages"[ ' pages | format ' checked' ' ]>
			<label for="pages">[ ' _t: SearchInPages ' ]</label><br>
			<input type="checkbox" id="comments" name="comments"[ ' comments | format ' checked' ' ]>
			<label for="comments">[ ' _t: SearchInComments ' ]</label><br>
			<input type="checkbox" id="titles" name="titles"[ ' titles | format ' checked' ' ]>
			<label for="titles">[ ' _t: SearchInPageTitles ' ]</label><br><br>
			[= options _ =
				<details open>
					<summary>[ ' _t: OptionalFilters ' ]</summary>
					<div class="form-options">
				[= l _ =
					<label for="language">[ ' _t: AccountLanguage ' ]</label><br>
					<select id="language" name="lang">
						<option value=""[ ' selected ' ]>[ ' _t: Any ' ]</option>
						[= o _ =
							<option value="[ ' iso ' ]"[ ' selected ' ]>[ ' lang ' ] ([ ' iso ' ])</option>
						=]
					</select><br>
				=]
				[= c _ =
					<br>[ ' _t: Categories ' ]:
					[ ' categories ' ]
				=]
				</div>
				</details><br>
			=]
			<br>
			<button type="submit" class="btn-ok">[ ' _t: SearchButton ' ]</button> 
			[ '' cancel '' ]<br>
		</form>
		<br>
	=]
	[= matches _ =
		[= warning _ =
			<p class="msg warning">[ ' msg ' ]</p><br>
		=]
		<br><br>
		[ '' pagination '' ]
		[= mark _ =
			<div class="layout-box">
			<p>
				<span>[ ' diag ' ] «<strong>[ ' phrase | e ' ]</strong>» ([ ' count | e ' ]):</span>
			</p>
		=]

		<ol id="search-results" start="[ ' offset ' ]">
		[= l _ =
			[ ' delim | void ' ]
			<li>
				[ '' l SearchItem '' ]
			</li>
		=]
		</ol>
		[= emark _ =
			[ ' nonstatic ' ]
			</div>
		=]
		[ '' pagination '' ]
		<br>
		<br>
	=]

[= cancel =]
<a href="[ ' href: ' ]" class="btn-link">
	<button type="button" class="btn-cancel">[ ' _t: CancelButton ' ]</button>
</a>


[= SearchItem =]
<h3>
	[ ' link ' ]
</h3>
<span class="search-meta">[ ' mtime | time_format ' ] - [ ' userlink ' ] - [ ' psize ' ] [ ' lang ' ]
[= comments =
	- <img src="[ ' db: theme_url ' ]icon/spacer.png" class="btn-comment btn-sm">[ ' n ' ]
=]
</span><br>
[ ' preview | nl2br ' ]

[ ' category ' ]

[= pagination =]
<nav class="pagination">[ ' text ' ]</nav>

%%

===Using regular expressions===
See ((/Doc/English/Actions/AdminReplace AdminReplace)) action

===See also===
  * ((/Doc/English/Actions/AdminReplace AdminReplace)) - allows administrators to do a global string find-and-replace on all the content pages
  * ((/Dev/PatchesHacks/MassRegexReplace MassRegexReplace)) - Mass edit using regular expressions