--- search.pl Mon Mar 4 14:40:53 2002
+++ /home/cgi/search/search.pl Tue Apr 9 13:43:07 2002
@@ -21,6 +21,13 @@
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
+# MODIFIED 2001-03-04, 2002-04-09 by daniel.naber@t-online.de:
+# -The * can be used as a wildcard in search terms. It can be used
+# anywhere in the term (several * may appear in a single term).
+# -This version only supports "Match all words". Everything else
+# (like "-" in front of a word) will be ignored.
+
+
# Comment in the next two lines to log and show how long searches take:
#use Time::HiRes qw ();
#my $start_time = [Time::HiRes::gettimeofday];
@@ -81,7 +88,7 @@
tie %dates_db, $db_package, $DATES_DB_FILE, O_RDONLY, 0755 or die "Cannot open $DATES_DB_FILE: $!";
tie %terms_db, $db_package, $TERMS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $TERMS_DB_FILE: $!";
-my (@force, @not, @other);
+my (@force);
my (@docs, @valid_docs);
my %answer;
@@ -110,15 +117,11 @@
} else {
# initialize everything with empty values (because we might run under mod_perl)
@force = ();
- @not = ();
- @other = ();
@docs = ();
@valid_docs = ();
%answer = ();
- if (create_query()) { #if some valid documents exist
- apply_booleans();
- answer_query();
- }
+ create_query();
+ answer_query();
my $html = cast_template();
if( $ENV{'REQUEST_METHOD'} ) {
print $query->header;
@@ -216,6 +219,7 @@
}
return $content_new;
}
+
sub create_query {
my $query_str = cleanup($query->param('q'));
my $mode = cleanup($query->param('mode'));
@@ -223,71 +227,52 @@
my $buffer;
my ($sterm, $nterm);
+ my @tmpforce = ();
+ my $ct = 0;
foreach my $term (@terms) {
- $buffer = normalize($term);
+ $ct++;
+ my $org_term = $term;
+ $term = normalize($term);
if( grep(/^\Q$term\E$/, @stopwords) ) {
- push(@stopwords_ignored, $term);
+ push(@stopwords_ignored, $org_term);
next;
}
- foreach my $nterm (split " ",$buffer) {
- $sterm = stem($nterm);
+ $term =~ s/^\s+//;
+ $term =~ s/\s+$//;
+ $sterm = stem($term);
+ @tmpforce = ();
+ if ( $mode eq 'all' && $term !~ m/^(\+|\-)/ ) {
# For "Match all words" just add a "+" to every term that has no operator:
- if ( $mode eq 'all' && $term !~ m/^(\+|\-)/ ) {
- $term = '+'.$term;
- }
- if ($term =~ /^\+/) {
- if ($terms_db{$sterm}) {
- push @force, $terms_db{$sterm};
- } else {
- return 0; # this term was not found, we can stop already
+ $term = '+'.$term;
+ }
+ if( $term =~ /\*/ ) {
+ $term =~ s/\*/.*/g; # use '*' as Joker
+ foreach $listterm (keys %terms_db) {
+ #debug: print "$listterm =~ m/^$term\$/i ($terms_db{$listterm})<br>\n";
+ if( $listterm =~ m/^$term$/i ) {
+ #print "** match<br>\n";
+ %v = unpack("S*", $inv_index_db{$terms_db{$listterm}});
+ push(@tmpforce, keys %v);
+ push(@force, $terms_db{$listterm});
}
- } elsif ($term =~ /^\-/) {
- push @not, $terms_db{$sterm} if $terms_db{$sterm};
- } else {
- push @other, $terms_db{$sterm} if $terms_db{$sterm};
}
+ } else {
+ if( $terms_db{$sterm} ) {
+ %v = unpack("S*", $inv_index_db{$terms_db{$sterm}});
+ push(@tmpforce, keys %v);
+ push(@force, $terms_db{$sterm});
+ }
}
- }
-
- return 1;
-}
-
-sub apply_booleans {
- #locate the valid documents by applying the booleans
- my ($term_id, $doc_id, $first_doc_id);
- my %v = ();
- my @ary = ();
- my @not_docs = ();
-
- my %not_docs = ();
- map { $not_docs{$_} = 1 } @not_docs;
-
- foreach $term_id (@not) {
- %v = unpack("S*", $inv_index_db{$term_id});
- foreach $doc_id (keys %v) {
- push @not_docs, $doc_id unless exists $not_docs{$doc_id};
- }
- }
-
- if (@force) {
- $first_doc_id = pop @force;
- %v = unpack("S*", $inv_index_db{$first_doc_id});
- @valid_docs = keys %v;
- foreach $term_id (@force) {
- %v = unpack("S*", $inv_index_db{$term_id});
- @ary = keys %v;
- @valid_docs = intersection(\@valid_docs, \@ary);
+ if( $ct > 1 ) {
+ @valid_docs = intersection(\@valid_docs, \@tmpforce);
+ } else {
+ @valid_docs = @tmpforce;
}
- push @force, $first_doc_id;
- } else {
- @valid_docs = keys %docs_db;
}
-
- @valid_docs = minus(\@valid_docs, \@not_docs);
}
sub answer_query {
- my @term_ids = (@force, @other);
+ my @term_ids = (@force);
my %valid_docs = ();
map { $valid_docs{$_} = 1 } @valid_docs;
@@ -538,7 +523,7 @@
$buffer =~ s/(\b\d+\b)|(<[^>]*>)/ /gs;
}
- $buffer =~ tr/a-zA-Z0-9_/ /cs;
+ $buffer =~ tr/a-zA-Z0-9_*/ /cs; # joker: don't filter '*'
return lc $buffer;
}