This is the historical archive of the now-inactive 'grubstreet' list.
Discussion on OpenGuides development has now moved to OpenGuides-Dev. Discussion on The Open Guide to London now takes place on OpenGuides-London.

[grubstreet] standalone search facility for Usemod wikis

[prev] [thread] [next] [lurker] [Date index for 2002/10/21]

From: Ivor Williams
Subject: [grubstreet] standalone search facility for Usemod wikis
Date: 16:02 on 21 Oct 2002
http://london.pm.org/pipermail/london.pm/Week-of-Mon-20021014/014414.html

The archive shows that the attachment has been garbled into 64 bits - I don't
know if it's just the archive that's garbled or whether people were not able to
read the attachment at all. I certainly got a mail with an attachment I could 
read.

Just in case it was garbled, I have cut 'n' pasted it below (not as an 
attachment, in case my outgoing SMTP is garbling it).

Can you please put it up somewhere so we can all try it against a full sized 
Grubstreet wiki. Either the real thing, or the wikibeta site.


Ivor

__CODE FOLLOWS__  search.pl
#!perl -T
#
# Usemod Wiki search facility
#
# Ivor Williams: October 2002
#
# Change the variable $wikiroot and $wikimain below for your site specific
# datadir path and wiki url.
#
# The prog uses bog-standard CGI.pm stuff. The main program can be tailored for
# look and feel.

use strict;
use warnings;

use CGI qw(:standard *ol);
use CGI::Carp qw(fatalsToBrowser);	#Remove fatalsToBrowser if paranoid

use Parse::RecDescent;
use Data::Dumper;
use File::Spec::Functions qw(:ALL);

use vars qw($wikiroot $wikimain %wikitext);

$wikiroot = "c:\\programswikidb";
$wikimain = "wiki.pl";

# sub matched_items is called with parse tree. Uses horrible subname
# concatenation - this could be rewritten to use OO instead and be much
# neater. This would be a major refactor: need to address design issues -
# patterns?

# Dispatch a parse-tree node to its matched_* handler.  The node's first
# element names the operation (word, literal, AND, OR, NOT); the remaining
# elements are left in @_ and handed to the handler via goto, so the
# handler sees them as its own argument list.
sub matched_items {
	my $op = shift;

	no strict 'refs';
	goto &{"matched_$op"};	# tail-call the handler with the remaining @_
}

# sub load_wiki_text is used to load the entire wiki into global hash
# %wikitext. This is a performance hit every time the search is used. Looks OK
# for small wikis. Could replace this sub with something that makes %wikitext
# persistent, using a tied hash - issue of when to reload comes up.
#
# Note: uses File::Spec routines so as to be fully portable. Works on Windoze,
# should work on Unix just as well. Uses the patent Perlmonks superslurper
# trick.

# Slurp every topic page ($wikiroot/page/*/X.db) and every subtopic page
# ($wikiroot/page/*/*/X.db) into the global %wikitext hash.  Topics are keyed
# by page name, subtopics by "topic/subtopic".  Reads $wikiroot; overwrites
# any existing %wikitext entries with the same key.
sub load_wiki_text {

# glob for topics

	my $wikiglob = catdir($wikiroot,'page','*','*.db');

	for (glob $wikiglob) {
		my ($dev,$dir,$term) = splitpath($_);

		$term =~ s/\.db$//;	# anchored: strip only the trailing extension
		$wikitext{$term} = do { local (@ARGV, $/) = $_; <> }; # slurp entire file
	}

# glob for subtopics

	$wikiglob = catdir($wikiroot,'page','*','*','*.db');

	for (glob $wikiglob) {
		my ($dev,$dir,$term) = splitpath($_);
		my @sd = splitdir($dir);

		$term =~ s/\.db$//;	# anchored: strip only the trailing extension
		$term = $sd[-2].'/'.$term;	# qualify subtopic with its parent topic dir
		$wikitext{$term} = do { local (@ARGV, $/) = $_; <> }; # slurp entire file
	}
}

# Output HTML search form with appropriate headers.

print header,start_html("Usemod Search"),
					h1("Usemod Search"),"\n";
print start_form, textfield(
					-name=>'search',
					-size=>50,
					-maxlength=>80),end_form,"\n";

# Do we have an existing search? If so, do it.

my $q = CGI->new;
my $srh = $q->param('search');

RESULTS:
{
	if ($srh) {

# Check for only valid characters in tainted search param
# (quoted literals are OK, as they are escaped)

		if ($srh !~ /^("[^"]*"|[a-z _\-'&|()!*%])+$/i) { #"
			print h1("Search expression contains invalid character");
			last RESULTS;
		}

		load_wiki_text();

# Build RecDescent grammar for search syntax.
# Note: '&' and '|' can be replaced with other non-alpha. This may be needed if
# you need to call the script from METHOD=GET (as & is a separator).
# Also, word: pattern could be changed to include numbers and handle locales
# properly. However, quoted literals are usually good enough for most odd
# characters.

		my $parse = Parse::RecDescent->new(q{

			search: list eostring {$return = $item[1]}

			list: <leftop: comby '|' comby>
				{$return = (@{$item[1]}>1) ? ['OR', @{$item[1]}] : $item[1][0]}

			comby: <leftop: term '&' term>
				{$return = (@{$item[1]}>1) ? ['AND', @{$item[1]}] : $item[1][0]}

			term: '(' list ')' {$return = $item[2]}
			|		'!' term {$return = ['NOT', @{$item[2]}]}
			|		'"' /[^"]*/ '"' {$return = ['literal', $item[2]]}
			|		word(s) {$return = ['word', @{$item[1]}]}

			word: /[a-z'*%]+/i {$return = $item[1]}

			eostring: /^\Z/

		}) or die $@;

# Turn search string into parse tree; on a syntax error report and bail out
# of the RESULTS block.

		my $tree = $parse->search($srh)
			or (print h1("Search syntax error")), last RESULTS;
#		print pre(Dumper($tree));

		print hr,h2('Search Results'),start_ol,"\n";

# Apply search and display results: one list item per page, linking back to
# the wiki page, followed by the highlighted context snippets.

		my %results = matched_items(@$tree);
		for (keys %results) {
			print p(li(a({href=>$wikimain."?$_"},b($_)) . br . $results{$_}));
		}
		print end_ol,"\n";
	}
}

print end_html,"\n";

######### End of main program.

# Utility routines to actually do the search

# do_search - scan every page in %wikitext for the regex fragment passed in,
# returning a hash of page name => HTML snippet(s) with the match bolded.
# Pages with no match are omitted.
sub do_search {
	my $wmatch = shift;

# Build regexp from parameter. Gobble up to 60 characters of context either
# side. \xb3 is the special usemod field separator.

	my $wexp = qr/\W([^\xb3]{0,60}\W$wmatch\W[^\xb3]{0,60})(?:\W|$)/is;
	my %res;

	while (my ($k,$v) = each %wikitext) {
		my $out = '';
		for ($v =~ /$wexp/g) {
			# Was 'my $match .= ...': appending to a fresh (undef) lexical,
			# which triggers an uninitialized-value warning. Plain assignment
			# is the intended behavior.
			my $match = "...$_...";
			$match =~ s!$wmatch!<b>$&</b>!i;	# bold the first occurrence
			$out .= $match;
		}

		$res{$k} = $out if $out;
	}

	%res;
}

# matched_word - we have a list of adjacent words. Words are allowed to contain
# wildcards * and %

# matched_word - we have a list of adjacent words. Words are allowed to contain
# wildcards * and %

sub matched_word {
	# Adjacent words may be separated by any run of characters that are
	# neither letters nor the usemod field separator (\xb3).
	my $pattern = join '[^a-z\xb3]+', @_;

	# Translate the wiki wildcards into their regex equivalents:
	#   %  -> exactly one letter;  *  -> any run of letters (possibly empty).
	$pattern =~ s/%/[a-z]/g;
	$pattern =~ s/\*/[a-z]*/g;

	return do_search($pattern);
}

# matched_literal - we have a literal.

# matched_literal - we have a literal.

sub matched_literal {
	# Quoted strings match verbatim: quotemeta neutralises any regex
	# metacharacters before handing the pattern to do_search.
	return do_search(quotemeta shift);
}

# matched_AND - we have a combination of subsearches.

# matched_AND - intersect the results of several subsearches: a page is kept
# only when every subsearch matched it, and its snippets are concatenated.
sub matched_AND {

	# Run every subsearch; each yields a hash of page => snippet HTML.
	my @partials = map { +{ matched_items(@$_) } } @_;

	# Seed the combined result set from the first subsearch.
	my %combined = %{ shift @partials };

	# Drop any page missing from one of the remaining subsearches; pages
	# that survive accumulate the snippets from all of them.
	for my $page (keys %combined) {
		if (grep { !exists $_->{$page} } @partials) {
			delete $combined{$page};
			next;
		}

		$combined{$page} .= $_->{$page} for @partials;
	}

	%combined;
}

# matched_OR - we have a list of subsearches

# matched_OR - union of several subsearches, merging any duplicates.
sub matched_OR {

	my %merged;

	# Run each subsearch in turn and fold its page => snippet pairs into
	# the union; a page matched by several subsearches gets its snippets
	# concatenated.
	for my $subsearch (@_) {
		my %partial = matched_items(@$subsearch);
		$merged{$_} .= $partial{$_} for keys %partial;
	}

	%merged;
}

# matched_NOT - Form complement of hash against %wikitext

# matched_NOT - complement of the subsearch against the full %wikitext page
# set.  Excluded pages carry empty snippets (there is no match to show).
sub matched_NOT {

	my %matched = matched_items(@_);

	my %complement;
	$complement{$_} = '' for grep { !exists $matched{$_} } keys %wikitext;

	%complement;
}

=head1 NAME

search.pl - Enhancement to Usemod Wiki for searches

=head1 SYNOPSIS

Invoked as a CGI script.

Examples of search strings:

king's head
king's head&fullers
coach and horses|crown and anchor
(vegetarian|vegan)&takeaway
category restaurants&!expensive

=head1 DESCRIPTION

This script presents a single search form when called. The search string is
parsed with a full RecDescent grammar, and the wiki pages are searched for
matches.

Borrowing from Perl (or C), & represents AND, | represents OR, and ! represents
NOT.

For notes about how to tailor this script, please read the script comments.

=head1 AUTHOR

I. Williams, E<lt>ivor.williams@xxxxxxx.xx.xxx<gt>

=head1 SEE ALSO

L<usemod>.



-- 
grubstreet mailing list
http://london.openguides.org/old-list-archives/