Action: Binary Search

Compatible with: R6.1
Current version: 0.1 Draft
Credits: WikiAdmin

Binary search allows users to do a global string search on all the content pages of a wiki using exact or regex search.


  1. exact - binary search, include only results that match the exact search term (BINARY LIKE)
  2. regexp - regex search, interpret the search term as a regular expression (BINARY REGEXP)
  3. union - union search, include results that match any of the whitespace separated keywords (AGAINST IN BOOLEAN MODE & LIKE lower)

{{search_binary}}


Work in progress


This puts the binary search functionality of admin_replace into a separate action, so that it is available to all users.


Example for binary search using regex


TODO

  • merge into search action (?)
    • preview and context are different
  • add message sets for DONE
    • 'SearchInPages'				=> 'Search in page contents',
      'SearchInComments'			=> 'Search in comments',
      'SearchInPageTitles'		=> 'Search in page titles',	

action/search_binary.php

<?php

// refuse to run when IN_WACKO is not defined (the constant is set outside this file)
if (!defined('IN_WACKO'))
{
    exit;
}

/*
 TODO:
 - put target with regex data in session
 - improve search, possibly merge with search action
 - allow binary search only for registered users (?)
 */

// help text of the action; handed to $this->help() when the action is called with help set
$info = <<<EOD
Description:
    binary search allows users to do a global string search on all the content pages of a wiki.

Usage:
    {{search_binary}}

Options:
    [page="PageName"]
    [options=1]
    [lang="en"]
    [max=Number]
EOD;

// functions
/**
 * Run the binary (case-sensitive) search and return one page of results.
 *
 * @param string $target    search term, or a regular expression when $use_regex is set
 * @param string $tag       namespace; restricts matches to this tag and its sub-pages (empty = no restriction)
 * @param bool   $use_regex match with BINARY REGEXP instead of BINARY LIKE
 * @param int    $limit     results per page, handed to $this->pagination()
 * @param array  $filter    optional keys: category_ids, lang, comments, pages, titles
 * @param bool   $deleted   when set, the "deleted <> 1" condition is left out
 *
 * @return array [result rows, pagination data, total number of matches]
 */
$search_text = function ($target, $tag, $use_regex, $limit, $filter = [], $deleted = false)
{
    $category_ids   = null;
    $lang           = null;
    $comments       = null;
    $pages          = null;
    $titles         = null;
    $prefix         = $this->prefix;

    // only the filter keys declared above are taken over
    extract($filter, EXTR_IF_EXISTS);

    // choose match operator and quoted pattern
    if ($use_regex)
    {
        $operator   = 'REGEXP';
        $pattern    = $this->db->q($target);
    }
    else
    {
        // LIKE treats %, _ and \ as pattern syntax; escape them so the term is matched literally
        $operator   = 'LIKE';
        $pattern    = $this->db->q('%' . strtr($target, ['\\' => '\\\\', '%' => '\\%', '_' => '\\_']) . '%');
    }

    // choose match scope: body for pages and/or comments, title for titles
    $conditions = [];

    if ($pages || $comments)
    {
        $conditions[] = 'BINARY a.body ' . $operator . ' ' . $pattern;
    }

    if ($titles)
    {
        $conditions[] = 'BINARY a.title ' . $operator . ' ' . $pattern;
    }

    // without any scope nothing can match; keep the WHERE clause valid instead of reading an undefined variable
    $match = $conditions
        ? '(' . implode(' OR ', $conditions) . ') '
        : '(1 = 0) ';

    // namespace: include tag and tag/%, not tag%
    $selector =
        ($category_ids
            ? 'LEFT JOIN ' . $prefix . 'category_assignment ca ON (a.page_id = ca.object_id) '
            : '') .
        ($tag
            ? 'LEFT JOIN ' . $prefix . 'page b ON (a.comment_on_id = b.page_id) '
            : '') .
        'WHERE ' .
            $match .
            ($tag
                ? 'AND (a.tag = '    . $this->db->q($tag) . ' ' .
                  'OR a.tag LIKE '    . $this->db->q($tag . '/%') . ' ' .
                  'OR b.tag = '        . $this->db->q($tag) . ' ' .
                  'OR b.tag LIKE '    . $this->db->q($tag . '/%') . ') '
                : '') .
            // comments only -> rows with a parent; pages (or titles) only -> rows without a parent
            ($comments
                ? ($pages
                    ? ''
                    : 'AND a.comment_on_id <> 0 ')
                : 'AND a.comment_on_id = 0 ') .
            (!empty($this->sess->replace_unset)
                ? 'AND a.page_id NOT IN (' . $this->ids_string($this->sess->replace_unset) . ') '
                : '') .
            ($lang
                ? 'AND a.page_lang = ' . $this->db->q($lang) . ' '
                : '') .
            ($category_ids
                ? 'AND ca.category_id IN (' . $this->ids_string($category_ids) . ') ' .
                  'AND ca.object_type_id = ' . (int) OBJECT_PAGE . ' '
                : '') .
            ($deleted
                ? ''
                : ($tag
                    ? 'AND (a.deleted <> 1 OR b.deleted <> 1) '
                    : 'AND a.deleted <> 1 ')) .
        ' ';

    $count = $this->db->load_single(
        'SELECT COUNT(a.page_id) AS n ' .
        'FROM ' . $prefix . 'page a ' .
        $selector, true);

    // parameters carried over by the pagination links; the category filter is passed
    // as 'categories' (comma separated) because that is the request key this action reads
    // NOTE(review): the namespace (request key 'page') is not carried over - confirm whether intended
    $params = ['target' => $target]
        + (!empty($lang)            ? ['lang'       => $lang]                           : [])
        + (!empty($category_ids)    ? ['categories' => implode(',', $category_ids)]     : [])
        + (!empty($comments)        ? ['comments'   => $comments]                       : [])
        + (!empty($titles)          ? ['titles'     => $titles]                         : [])
        + (!empty($pages)           ? ['pages'      => $pages]                          : [])
        + (!empty($use_regex)       ? ['use_regex'  => $use_regex]                      : []);

    $pagination = $this->pagination($count['n'], $limit, 'p', $params);

    // load search results
    $results = $this->db->load_all(
        'SELECT a.page_id, a.owner_id, a.user_id, a.tag, a.title, a.created, a.modified, a.body, a.comment_on_id, a.page_lang, a.page_size, a.comments,
            u.user_name, o.user_name as owner_name ' .
        'FROM ' . $prefix . 'page a ' .
            'LEFT JOIN ' . $prefix . 'user u ON (a.user_id = u.user_id) ' .
            'LEFT JOIN ' . $prefix . 'user o ON (a.owner_id = o.user_id) ' .
        $selector .
        'ORDER BY a.tag ' .
        $pagination['limit']);

    // cache the loaded rows and collect their ids for the preload calls below
    $page_ids = [];

    foreach ($results as $result)
    {
        $this->cache_page($result, true);
        $page_ids[] = $result['page_id'];
        $this->page_id_cache[$result['tag']] = $result['page_id'];
    }

    if (!empty($page_ids))
    {
        $this->preload_acl($page_ids);
        $this->preload_categories($page_ids);
    }

    return [$results, $pagination, $count['n']];
};

/**
 * Pass the message through Ut::html() and make leading, trailing and
 * doubled spaces visible by pairing them with NBSP.
 */
$space_to_nbsp = function ($message)
{
    // the rules are applied one after another, in this order
    $patterns = [
        '/^ /m',    // space at the start of a line
        '/ $/m',    // space at the end of a line
        '/  /',     // two consecutive spaces
    ];

    $replacements = [
        NBSP . ' ',
        ' ' . NBSP,
        NBSP . ' ',
    ];

    return preg_replace($patterns, $replacements, Ut::html($message));
};

/**
 * Strip a truncated multibyte character from the end of a string.
 *
 * A trailing lead byte (>= 0xc0) is dropped on its own; a trailing run made of
 * a 3- or 4-byte lead followed by too few continuation bytes is dropped as a whole.
 * Anything else is returned untouched.
 */
$remove_bad_char_last = function ($string)
{
    if ($string == '')
    {
        return $string;
    }

    $last_byte = ord(substr($string, -1));

    // a lone lead byte: the rest of the character is missing
    if ($last_byte >= 0xc0)
    {
        return substr($string, 0, -1);
    }

    // plain ASCII at the end, nothing to repair
    if ($last_byte < 0x80)
    {
        return $string;
    }

    // continuation byte at the end: only cut when the sequence is incomplete
    // (the /s modifier lets (.*) span newlines)
    $parts      = [];
    $incomplete = '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf]{1,2})$/s';

    return preg_match($incomplete, $string, $parts)
        ? $parts[1]
        : $string;
};

/**
 * Strip a truncated multibyte character from the start of a string:
 * when the first byte is a continuation byte (0x80-0xbf), the whole
 * leading run of continuation bytes is removed.
 */
$remove_bad_char_first = function ($string)
{
    if ($string == '')
    {
        return $string;
    }

    $first_byte         = ord($string[0]);
    $starts_mid_char    = ($first_byte >= 0x80 && $first_byte < 0xc0);

    return $starts_mid_char
        ? preg_replace('/^[\x80-\xbf]+/', '', $string)
        : $string;
};

/**
 * Shorten a string to at most |$length| bytes and attach $ellipsis.
 *
 * A positive $length keeps the beginning ("text..."), a negative one keeps
 * the end ("...text"); a $length of 0 yields just the ellipsis. Strings that
 * already fit are returned unchanged. Bytes of a character split by the cut
 * are removed through the two captured helper closures.
 */
$truncate_string = function ($string, $length = 80, $ellipsis = '...') use ($remove_bad_char_first, $remove_bad_char_last)
{
    // short enough already
    if (strlen($string) <= abs($length))
    {
        return $string;
    }

    if ($length == 0)
    {
        return $ellipsis;
    }

    if ($length > 0)
    {
        // text...
        $head = $remove_bad_char_last(substr($string, 0, $length));

        return rtrim($head) . $ellipsis;
    }

    // ...text
    $tail = $remove_bad_char_first(substr($string, $length));

    return $ellipsis . ltrim($tail);
};

/**
 * Build the preview for one page body: every match of $target together with
 * up to $padding bytes of text before and after it, the matches wrapped in <code>.
 *
 * Matches that lie closer together than twice the padding are merged into one snippet.
 * All text is passed through $space_to_nbsp, line breaks are shown as '↵'.
 *
 * @param string $text      page body to search in
 * @param string $target    search term, or a regular expression when $use_regex is set
 * @param bool   $use_regex interpret $target as regular expression
 * @param int    $padding   bytes of context kept on each side of a snippet
 *
 * @return string preview markup, empty when nothing matched
 */
$extract_context = function ($text, $target, $use_regex = false, $padding = 40) use ($space_to_nbsp, $truncate_string)
{
    // build the pattern that locates the matches
    if ($use_regex)
    {
        $target_q    = str_replace('/', "\\/", $target);
        $target_str    = "/$target_q/Uu";
    }
    else
    {
        $target_q    = preg_quote($target, '/');
        $target_str    = "/$target_q/u";
    }

    preg_match_all($target_str, $text, $matches, PREG_OFFSET_CAPTURE);

    // each entry: [0] => matched string, [1] => byte offset
    $pos    = $matches[0] ?? [];
    $total    = count($pos);
    $cuts    = [];

    for ($i = 0; $i < $total; $i++)
    {
        $index    = $pos[$i][1];
        $len    = strlen($pos[$i][0]);

        // merge following matches that start within the padded area
        while (isset($pos[$i + 1][1]) && $pos[$i + 1][1] < $index + $len + $padding * 2)
        {
            $i++;

            // the snippet ends where the merged match ends; matches of a regex
            // may differ in length, so use the length of this very match
            $len = $pos[$i][1] + strlen($pos[$i][0]) - $index;
        }

        $cuts[] = [$index, $len];
    }

    // the snippet is transformed by $space_to_nbsp before highlighting,
    // so the literal term has to be transformed the same way
    // NOTE(review): in regex mode the untransformed pattern is applied to the
    // transformed snippet, which may miss matches containing affected characters
    if (!$use_regex)
    {
        $target_q    = preg_quote($space_to_nbsp($target), '/');
        $target_str    = "/$target_q/u";
    }

    $context = '';

    foreach ($cuts as $cut)
    {
        [$index, $len]        = $cut;

        $context_before        = $truncate_string(substr($text, 0, $index), -$padding);
        $context_after        = $truncate_string(substr($text, $index + $len), $padding);
        $snippet            = $space_to_nbsp(substr($text, $index, $len));

        $context            .= $space_to_nbsp($context_before);
        $context            .= preg_replace($target_str, '<code>\0</code>', $snippet);
        $context            .= $space_to_nbsp($context_after);
    }

    // visualize line breaks
    $context = str_replace("\n", '↵', $context);

    return $context;
};

/**
 * Fill the 'search_' section of the template with the current search settings.
 *
 * @param array $o action options; 'filter' => 'category_ids', 'target', 'use_regex',
 *                 'page', 'pages', 'titles', 'comments', 'options' and 'lang' are read here
 */
$search_form = function (array $o) use ($tpl)
{
    $category_ids        = $o['filter']['category_ids'];

    $tpl->enter('search_');

    // current values of the form fields
    $tpl->target        = $o['target'];
    $tpl->regex            = $o['use_regex'];
    $tpl->tag            = $o['page'];
    $tpl->pages            = $o['pages'];
    $tpl->titles        = $o['titles'];
    $tpl->comments        = $o['comments'];
    $tpl->cancel        = true;

    // optional filters (categories, language) are only offered when enabled
    if ($o['options'])
    {
        $tpl->options        = true;

        $tpl->enter('options_');

        $tpl->c_categories    = $this->show_category_form($this->page_lang, null, OBJECT_PAGE, false, false, $category_ids);

        if ($this->db->multilanguage)
        {
            $languages            = $this->_t('LanguageArray');
            $langs                = $this->http->available_languages();
            // preselect the first option ("any") when no language filter is set
            $tpl->l_selected    = $o['lang'] ? null : ' selected';

            foreach ($langs as $iso)
            {
                $tpl->l_o_iso    = $iso;
                $tpl->l_o_lang    = $languages[$iso];

                if ($iso == $o['lang'])
                {
                    $tpl->l_o_selected    = ' selected';
                }
            }
        }

        $tpl->leave(); // options_
    }

    $tpl->leave(); // search_
};

/**
 * Render the result list into the 'matches_' section of the template.
 *
 * @param array $pages      result rows as returned by $search_text
 * @param array $pagination pagination data; 'offset' and 'text' are read here
 * @param int   $tcount     total number of matches
 * @param mixed $max        not used here
 * @param array $o          action options; 'pages', 'comments', 'target', 'use_regex',
 *                          'padding', 'title' and 'msg_target' are read here
 */
$show_matches = function ($pages, $pagination, $tcount, $max, array $o) use ($tpl, $search_text, $extract_context)
{
    $tpl->enter('matches_');

    $tpl->replace        = true;
    $tpl->cancel        = true;

    if (count($pages) > 5)
    {
        $tpl->invert    = true;
    }

    $tpl->offset            = $pagination['offset'] + 1;
    $tpl->pagination_text    = $pagination['text'];
    $tpl->enter('l_');

    // number of entries actually listed; the array key cannot be used for this
    // because entries may be skipped
    $shown = 0;

    foreach ($pages as $n => $p)
    {
        // with hide_locked set, pages the user may not read are left out
        if (!$this->db->hide_locked || $this->has_access('read', $p['page_id']))
        {
            $tpl->delim = $n;
            $shown++;

            $preview    = '';

            // generate preview
            if (($o['pages'] || $o['comments']) && $this->has_access('read', $p['page_id']))
            {
                $preview    = $extract_context($p['body'], $o['target'], $o['use_regex'], $o['padding']);
            }

            $this->sess->replace_set[] = $p['page_id'];

            $tpl->enter('l_');

            $tpl->pageid    = $p['page_id'];
            $tpl->link        = $this->link('/' . $p['tag'], '', ($o['title'] ? $p['title'] : $p['tag']), '', false);
            $tpl->userlink    = $this->user_link($p['user_name'], false, false);
            $tpl->mtime        = $p['modified'];
            $tpl->psize        = $this->factor_multiples($p['page_size'], 'binary', true, true);

            if ($this->db->multilanguage)
            {
                $tpl->lang    = '- ' . $p['page_lang'];
            }

            $tpl->category    = $this->get_categories($p['page_id'], OBJECT_PAGE);
            $tpl->preview    = $preview;

            if ($p['comments'])
            {
                $tpl->comments_n    = $p['comments'];
            }

            $tpl->leave(); // l_
        }
    }

    $tpl->leave(); // l_

    // header and closing markup of the result box, only when something was listed
    if ($shown)
    {
        $tpl->mark_diag        = $this->_t('SearchResults');
        $tpl->mark_phrase    = $o['msg_target'];
        $tpl->mark_count    = $tcount;    // TODO: count only accessible results
        $tpl->emark            = true;
    }

    $tpl->leave(); // matches_
};

// --------------------------------------------------------------------------------

// set defaults for the action parameters that were not passed
$help            ??= 0;
$lang            ??= '';
$max            ??= 50;    // (10 ... 100)
$options        ??= 0;
$padding        ??= 40;
$page            ??= '/';
$title            ??= 1;

if ($help)
{
    $tpl->help    = $this->help($info, 'search_binary');
    return;
}

/*
if (!$this->is_admin())
{
    return;
}
*/

// request values: POST wins over GET (GET is used by the pagination links)
$action            = (string)    ($_POST['_action']            ?? null);
$categories        = (string)    ($_POST['categories']        ?? ($_GET['categories']        ?? ''));
$comments        = (bool)    ($_POST['comments']            ?? ($_GET['comments']        ?? 0));
$pages            = (bool)    ($_POST['pages']            ?? ($_GET['pages']            ?? 0));
$titles            = (bool)    ($_POST['titles']            ?? ($_GET['titles']            ?? 0));
$lang            = (string)    ($_POST['lang']                ?? ($_GET['lang']            ?? $lang));
$page            = (string)    ($_POST['page']                ?? ($_GET['page']            ?? $page));

$target            = (string)    trim(($_POST['target']        ?? ($_GET['target']            ?? '')));
#$phrase || $phrase = trim(($_GET['phrase'] ?? ''));

$use_regex        = (bool)    ($_POST['use_regex']        ?? ($_GET['use_regex']        ?? 0));

$show_search_form    = true;

// remove \r (body contains only \n)
$target                = str_replace("\r\n", "\n", $target);

// visualize line breaks in message sets
$msg_target            = str_replace("\n", '↵', $target);

if ($lang && !$this->known_language($lang))
{
    $lang = '';
    $this->set_message($this->_t('FilterLangNotAvailable'));
}

// empty page parameter, use root not page context
$page            = $page ?: '/';
$tag            = $this->unwrap_link($page);

// category filter
$category_ids    = [];

if ($categories)
{
    // the list comes straight from the request and ends up in the SQL filter:
    // keep only purely numeric ids
    foreach (explode(',', $categories) as $category_id)
    {
        $category_id = trim($category_id);

        if (preg_match('/^\d+$/D', $category_id))
        {
            $category_ids[] = $category_id;
        }
    }
}

// categories ticked in the form arrive as category<ID> = 'set'
foreach ($_POST as $key => $val)
{
    if (preg_match('/^category(\d+)$/', $key, $ids) && $val == 'set')
    {
        $category_ids[] = $ids[1];
    }
}

// options handed to the form and result closures; 'filter' is the part passed on to $search_text
$o = [
    'comments'            => $comments,
    'pages'                => $pages,
    'titles'            => $titles,
    'filter' => [
        'category_ids'    => $category_ids,
        'comments'        => $comments,
        'lang'            => $lang,
        'pages'            => $pages,
        'titles'        => $titles,
    ],
    'lang'                => $lang,
    'msg_target'        => $msg_target,
    'options'            => (bool) $options,
    'padding'            => (int) $padding,
    'page'                => $page,
    'tag'                => $tag,
    'target'            => $target,
    'title'                => $title,
    'use_regex'            => $use_regex,
];

// [B] show search matches
if ($target)
{
    $error = null;

    // at least one search scope has to be selected
    if (!$pages && !$comments && !$titles)
    {
        $error = ['ReplaceTextNoOption', 'hint'];
    }

    if ($error)
    {
        $tpl->message        = $this->show_message($this->_t($error[0]), $error[1], false);
        $show_search_form    = true;
    }
    else if (mb_strlen($target) >= 3)
    {
        // search for target matches; the rows get their own variable because
        // $pages is the scope flag that is still needed for the message below
        [$results, $pagination, $tcount] = $search_text($target, $tag, $use_regex, $max, $o['filter']);

        if ($results)
        {
            $show_matches($results, $pagination, $tcount, $max, $o);
        }
        else
        {
            // body was searched (alone or together with titles) vs. titles only
            $msg = ($pages || $comments)
                ? 'ReplaceTextNoMatch'
                : 'ReplaceTextNoTitleMatch';

            $tpl->message = $this->show_message(
                Ut::perc_replace(
                    $this->_t($msg),
                    '<code>' . Ut::html($msg_target) . '</code>'),
                'note', false);

            $show_search_form = true;
        }
    }
}
else
{
    $show_search_form = true;
}

// [A] search form
if ($show_search_form)
{
    unset(
        $this->sess->replace_set,
        $this->sess->replace_unset);

    $search_form($o);
}

action/template/search_binary.tpl

[ === main === ]
    [ ' help ' ]
    [ ' message ' ]<br>
    [= search _ =
        <form action="[ ' href: ' ]" method="post" name="select_pages">
            [ ' csrf: select_pages ' ]
            <label for="text_target">[ ' _t: SearchFor ' ]</label><br>
            <textarea id="text_target" name="target" class="cols-100" cols="100" rows="5" title="[ ' _t: ReplaceTextGiveTarget ' ]" required>[ ' target | pre ' ]</textarea><br>
            <input type="checkbox" id="use_regex" name="use_regex"[ ' regex | format ' checked' ' ]>
            <label for="use_regex">[ ' _t: ReplaceTextRegex ' ]</label><br><br>
            <label for="cluster" title="[ ' _t: ReplaceTextCluster ' ]">[ ' _t: Namespace ' ]</label><br>
            <input type="text" id="cluster" name="page" value="[ ' tag | e attr ' ]" size="80" maxlength="255"><br>
            <input type="checkbox" id="pages" name="pages"[ ' pages | format ' checked' ' ]>
            <label for="pages">[ ' _t: SearchInPages ' ]</label><br>
            <input type="checkbox" id="comments" name="comments"[ ' comments | format ' checked' ' ]>
            <label for="comments">[ ' _t: SearchInComments ' ]</label><br>
            <input type="checkbox" id="titles" name="titles"[ ' titles | format ' checked' ' ]>
            <label for="titles">[ ' _t: SearchInPageTitles ' ]</label><br><br>
            [= options _ =
                <details open>
                    <summary>[ ' _t: OptionalFilters ' ]</summary>
                    <div class="form-options">
                [= l _ =
                    <label for="language">[ ' _t: AccountLanguage ' ]</label><br>
                    <select id="language" name="lang">
                        <option value=""[ ' selected ' ]>[ ' _t: Any ' ]</option>
                        [= o _ =
                            <option value="[ ' iso ' ]"[ ' selected ' ]>[ ' lang ' ] ([ ' iso ' ])</option>
                        =]
                    </select><br>
                =]
                [= c _ =
                    <br>[ ' _t: Categories ' ]:
                    [ ' categories ' ]
                =]
                </div>
                </details><br>
            =]
            <br>
            <button type="submit" class="btn-ok">[ ' _t: SearchButton ' ]</button>&nbsp;
            [ '' cancel '' ]<br>
        </form>
        <br>
    =]
    [= matches _ =
        [= warning _ =
            <p class="msg warning">[ ' msg ' ]</p><br>
        =]
        <br><br>
        [ '' pagination '' ]
        [= mark _ =
            <div class="layout-box">
            <p>
                <span>[ ' diag ' ] «<strong>[ ' phrase | e ' ]</strong>» ([ ' count | e ' ]):</span>
            </p>
        =]

        <ol id="search-results" start="[ ' offset ' ]">
        [= l _ =
            [ ' delim | void ' ]
            <li>
                [ '' l SearchItem '' ]
            </li>
        =]
        </ol>
        [= emark _ =
            [ ' nonstatic ' ]
            </div>
        =]
        [ '' pagination '' ]
        <br>
        <br>
    =]

[= cancel =]
<a href="[ ' href: ' ]" class="btn-link">
    <button type="button" class="btn-cancel">[ ' _t: CancelButton ' ]</button>
</a>


[= SearchItem =]
<h3>
    [ ' link ' ]
</h3>
<span class="search-meta">[ ' mtime | time_format ' ] - [ ' userlink ' ] - [ ' psize ' ] [ ' lang ' ]
[= comments =
    - <img src="[ ' db: theme_url ' ]icon/spacer.png" class="btn-comment btn-sm">[ ' n ' ]
=]
</span><br>
[ ' preview | nl2br ' ]

[ ' category ' ]

[= pagination =]
<nav class="pagination">[ ' text ' ]</nav>

1. Using regular expressions

See AdminReplace action

2. See also

  • AdminReplace - allows administrators to do a global string find-and-replace on all the content pages
  • MassRegexReplace - Mass edit using regular expressions

Show Files (1 file)