--- /home/dnaber/perlfect/search-3.30/conf.pl	2002-03-19 16:11:41.000000000 +0100
+++ conf.pl	2002-11-06 19:38:41.000000000 +0100
@@ -256,6 +256,7 @@
 $URLS_DB_FILE      = $DATA_DIR.'urls';
 $SIZES_DB_FILE     = $DATA_DIR.'sizes';
 $TERMS_DB_FILE     = $DATA_DIR.'terms';
+$POS_DB_FILE       = $DATA_DIR.'pos';
 $DF_DB_FILE        = $DATA_DIR.'df';
 $TF_DB_FILE        = $DATA_DIR.'tf';
 $CONTENT_DB_FILE   = $DATA_DIR.'content';
@@ -269,11 +270,12 @@
 $URLS_TMP_DB_FILE      = $DATA_DIR.'urls_tmp';
 $SIZES_TMP_DB_FILE     = $DATA_DIR.'sizes_tmp';
 $TERMS_TMP_DB_FILE     = $DATA_DIR.'terms_tmp';
+$POS_TMP_DB_FILE       = $DATA_DIR.'pos_tmp';
 $CONTENT_TMP_DB_FILE   = $DATA_DIR.'content_tmp';
 $DESC_TMP_DB_FILE      = $DATA_DIR.'desc_tmp';
 $TITLES_TMP_DB_FILE    = $DATA_DIR.'titles_tmp';
 $DATES_TMP_DB_FILE     = $DATA_DIR.'dates_tmp';
 
 # Official version number.
-$VERSION = "3.30";
+$VERSION = "3.30-phrase-search";
 1;
--- /home/dnaber/perlfect/search-3.30/indexer.pl	Tue Mar 19 15:17:35 2002
+++ indexer.pl	Sun Sep 29 03:01:34 2002
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-#$rcs = ' $Id: indexer.pl,v 1.70 2002/03/19 14:17:35 daniel Exp $ ' ;	
+#$rcs = ' $Id: indexer.pl,v 1.71 2002/05/16 15:45:38 daniel Exp $ ' ;	
 
 # Perlfect Search
 #
@@ -93,7 +93,6 @@
 my $TN  = 0;  #terms number (unique terms)
 my $TN_non_unique = 0;  #terms number
 
-my @no_index;
 my $no_index_count = 0;
 
 print "Checking for old temp files...\n";
@@ -109,6 +108,7 @@
 my %dates_db;       # document id -> date of last modification
 my %content_db;     # document id -> start of the document's content (to show context of matches)
 my %terms_db;       # term -> term id
+my %pos_db;         # doc_id/term_id -> pos1,pos2,...
 # The following two hashes are temporary and will not be saved to disk:
 my %df_db;          # term id -> number of occurences of this term in all documents
 my %tf_db;          # term id -> list of pairs: (document id, number of occurences in this document)
@@ -122,6 +122,7 @@
   tie %titles_db,    $db_package, $TITLES_TMP_DB_FILE, O_CREAT|O_RDWR, 0755    or die "Cannot open $TITLES_TMP_DB_FILE: $!";
   tie %dates_db,     $db_package, $DATES_TMP_DB_FILE, O_CREAT|O_RDWR, 0755     or die "Cannot open $DATES_TMP_DB_FILE: $!";
   tie %terms_db,     $db_package, $TERMS_TMP_DB_FILE, O_CREAT|O_RDWR, 0755     or die "Cannot open $TERMS_TMP_DB_FILE: $!"; 
+  tie %pos_db,       $db_package, $POS_TMP_DB_FILE, O_CREAT|O_RDWR, 0755       or die "Cannot open $POS_TMP_DB_FILE: $!"; 
   tie %df_db,        $db_package, $DF_DB_FILE, O_CREAT|O_RDWR, 0755            or die "Cannot open $DF_DB_FILE: $!";
   tie %tf_db,        $db_package, $TF_DB_FILE, O_CREAT|O_RDWR, 0755            or die "Cannot open $TF_DB_FILE: $!";
 }
@@ -160,6 +161,7 @@
   untie %titles_db;
   untie %dates_db;
   untie %terms_db;
+  untie %pos_db;
   untie %df_db;
   untie %tf_db;
   print "Removing unused db files:\n";
@@ -185,6 +187,7 @@
   save_db($TITLES_TMP_DB_FILE, %titles_db);
   save_db($DATES_TMP_DB_FILE, %dates_db);
   save_db($TERMS_TMP_DB_FILE, %terms_db);
+  save_db($POS_TMP_DB_FILE, %pos_db);
 }
 
 print "Renaming newly created db files...\n";
@@ -248,19 +251,40 @@
   get_tag_contents("title", $buffer, $TITLE_WEIGHT);
   get_headline_contents($buffer);
   # to search for text in the follwing meta tags:
-  ${$buffer} .= " ".get_meta_content("description", $buffer, $META_WEIGHT);
-  ${$buffer} .= " ".get_meta_content("keywords", $buffer, $META_WEIGHT);
-  ${$buffer} .= " ".get_meta_content("author", $buffer, $META_WEIGHT);
+  
+  get_meta_content("description", $buffer, $META_WEIGHT);
+  get_meta_content("keywords", $buffer, $META_WEIGHT);
+  get_meta_content("author", $buffer, $META_WEIGHT);
   # to search for images' alt texts:
   get_alt_texts($buffer);
   normalize($buffer);
+  #print "***\$buffer=$$buffer\n\n";
   
-  foreach (split " ", ${$buffer}) {
-    next if( $stopwords{$_} );	# ignore stopwords
-    $_ = substr $_, 0, $STEMCHARS if $STEMCHARS;
-    if (length $_ >= $MINLENGTH) {
-      $term_id = record_term($_);
-      ++$tf{$term_id};
+  my $word_count = 0;
+  foreach my $temp_term (split " ", ${$buffer}) {
+    my ($occ) = ($temp_term =~ m/^\[(\d+)\]/);		# weight
+    $temp_term =~ s/^\[(\d+)\]//;
+    next if( $stopwords{$temp_term} );	# ignore stopwords
+    $temp_term = substr $temp_term, 0, $STEMCHARS if $STEMCHARS;
+    if (length($temp_term) >= $MINLENGTH) {
+      # hack: extract number of occurrences for ranking:
+      # save the term:
+      $term_id = record_term($temp_term);
+      if( ! defined($occ) ) {
+        ++$tf{$term_id};
+      } else {
+        $tf{$term_id} = 0 if( ! defined($tf{$term_id}) );
+        $tf{$term_id} = $tf{$term_id} + $occ;
+      }
+      #print "###\$temp_term=$temp_term\n";
+      $word_count++;
+      # store position for phrase search:
+      my $id = $doc_id."/".$term_id;
+      if( defined($pos_db{$id}) ) {
+        $pos_db{$id} = $pos_db{$id} . ",$word_count";
+      } else {
+        $pos_db{$id} = "$word_count";
+      }
     }
   }
   
@@ -292,6 +316,7 @@
     print "\b >" unless $count % $step;
     foreach $doc_id (keys %tdf) {
       #print "weight = $tdf{$doc_id} * log ($DN / $df)\n";
+      #print "# \$term_id = $term_id : $tdf{$doc_id}\n";
       $weight = $tdf{$doc_id} * log ($DN / $df);
       $weight = int($weight*100);
       $weight = 65535 if ( $weight > 65535 );	# we're limited to 16 bit
@@ -319,13 +344,13 @@
   }
 
   # Replace HTML tags (and maybe numbers) by spaces:
-  if ($INDEX_NUMBERS) {
+  if ( 1 || $INDEX_NUMBERS) {			# fixme: make this configurable again
     ${$buffer} =~ s/(<[^>]*>)/ /gs;
   } else {
     ${$buffer} =~ s/(\b\d+\b)|(<[^>]*>)/ /gs;
   }
 
-  ${$buffer} =~ tr/a-zA-Z0-9_/ /cs;
+  ${$buffer} =~ tr/a-zA-Z0-9_\[\]/ /cs;
   ${$buffer} = lc ${$buffer};
 }
 
@@ -377,7 +402,7 @@
   my @desc_ary;
 
   # Save Description or beginning of body:
-  $desc = get_meta_content("description", $buffer, 1);
+  $desc = get_meta_content_nochange("description", $buffer, 1);
   if( ! $desc || $CONTEXT_SIZE ) {
     $cleanbody = get_cleaned_body($buffer, $file);
   }
@@ -409,9 +434,17 @@
   }
 }
 
+# Get the weighted content part of a certain meta tag.
+sub get_meta_content {
+  my $name = $_[0];
+  my $buffer = $_[1];
+  my $weight = $_[2];
+  ${$buffer} =~ s/<META\s+name\s*=\s*[\"\']?$name[\"\']?\s+content=[\"\'](.*?)[\"\'][\/\s]*>/addWeightTags($1, $weight)/igse;
+}
+
 # Get the content part for a certain meta tag. Weight with
 # a certain factor by just repeating the result that often.
-sub get_meta_content {
+sub get_meta_content_nochange {
   my $name = $_[0];
   my $buffer = $_[1];
   my $weight = $_[2];
@@ -426,36 +459,45 @@
   my $buffer = $_[0];
   my $alt_texts = "";
   while( ${$buffer} =~ m/alt\s*=\s*[\"\'](.*?)[\"\']/gis ) {
-  	$alt_texts .= " ".$1;
+    $alt_texts .= " ".$1;
   }
   ${$buffer} .= $alt_texts;
 }
 
-# Add the contents of a certain tag, weighted by just repeating these contents
-# to $buffer.
+# Get the weighted contents of a certain tag.
 sub get_tag_contents {
   my $tag = $_[0];
   my $buffer = $_[1];
   my $weight = $_[2];
   my $tag_content = "";
-  while( ${$buffer} =~ m/<$tag.*?>(.*?)<\/$tag>/igs ) {
-    $tag_content .= (" ".$1) x $weight;
-  }
-  ${$buffer} .= $tag_content;
+  ${$buffer} =~ s/<$tag.*?>(.*?)<\/$tag>/addWeightTags($1, $weight)/igse;
 }
 
-# Add the contents of all headline levels, weighted by just repeating these contents
-# to $buffer.
+# Add the weighted contents of all headline levels.
 sub get_headline_contents {
   my $buffer = $_[0];
   my $level;
   my $headlines = "";
   for( $level = 1; $level <= 6; $level++ ) {
-    while( ${$buffer} =~ m/<h$level.*?>(.*?)<\/h$level>/igs ) {
-      $headlines .= (" ".$1) x $H_WEIGHT{$level};
+    ${$buffer} =~ s/<h$level.*?>(.*?)<\/h$level>/addWeightTags($1, $H_WEIGHT{$level})/igse;
+  }
+}
+
+# Replace "term" by "[n]term" where n is the weight of that term
+sub addWeightTags {
+  my $content = $_[0];
+  my $weight = $_[1];
+  my $new_content = "";
+  return "" if( ! $content || $content =~ m/^\s+$/ || $weight == 0 );
+  foreach my $term (split(/\s+/, $content)) {
+    if( $weight == 1 ) {
+      $new_content .= $term.' ';
+    } elsif( $weight > 1 ) {
+      $new_content .= "[$weight]".$term.' ';
     }
   }
-  ${$buffer} .= $headlines;
+  #print "\$new_content=$new_content ($weight)\n";
+  return $new_content;
 }
 
 # Checks if a file is PDF depending on the filename. If so, write it to a
@@ -575,6 +617,7 @@
 # files can still be used while the new ones are being created.
 sub rename_db {
   my @files = (
+	       [$POS_TMP_DB_FILE, $POS_DB_FILE],
 	       [$TERMS_TMP_DB_FILE, $TERMS_DB_FILE],
 	       [$DOCS_TMP_DB_FILE, $DOCS_DB_FILE],
 	       [$URLS_TMP_DB_FILE, $URLS_DB_FILE],
@@ -635,9 +678,11 @@
   $zz = $TITLES_DB_FILE;
   $zz = $DATES_DB_FILE;
   $zz = $TERMS_DB_FILE;
+  $zz = $POS_DB_FILE;
   $zz = $DOCS_DB_FILE;
   $zz = $URLS_DB_FILE;
   $zz = $TMP_DIR;
   $zz = $CONTENT_DB_FILE;
   $zz = $INDEX_NUMBERS;
+  $zz = $VERSION;
 }
--- /home/dnaber/perlfect/search-3.30/search.pl	2002-03-04 14:40:53.000000000 +0100
+++ search.pl	2002-11-06 19:37:57.000000000 +0100
@@ -1,5 +1,5 @@
 #!/usr/bin/perl -w
-#$rcs = ' $Id: search.pl,v 1.73 2002/03/04 13:40:53 daniel Exp $ ' ;
+#$rcs = ' $Id: search.pl,v 1.79 2002/08/23 11:55:19 daniel Exp $ ' ;
 
 # Perlfect Search
 #
@@ -70,6 +70,7 @@
 my %titles_db;
 my %dates_db;
 my %terms_db;
+my %pos_db;
 
 tie %inv_index_db, $db_package, $INV_INDEX_DB_FILE, O_RDONLY, 0755 or die "Cannot open $INV_INDEX_DB_FILE: $!";   
 tie %docs_db,      $db_package, $DOCS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $DOCS_DB_FILE: $!";   
@@ -80,6 +81,7 @@
 tie %titles_db,    $db_package, $TITLES_DB_FILE, O_RDONLY, 0755    or die "Cannot open $TITLES_DB_FILE: $!";   
 tie %dates_db,     $db_package, $DATES_DB_FILE, O_RDONLY, 0755 or die "Cannot open $DATES_DB_FILE: $!";   
 tie %terms_db,     $db_package, $TERMS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $TERMS_DB_FILE: $!";   
+tie %pos_db,       $db_package, $POS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $POS_DB_FILE: $!";   
 
 my (@force, @not, @other);
 my (@docs, @valid_docs);
@@ -115,13 +117,11 @@
     @docs = ();
     @valid_docs = ();
     %answer = ();
-    if (create_query()) { #if some valid documents exist
-      apply_booleans();
-      answer_query();
-    }  
+    my @term_ids = create_query();  #if some valid documents exist
+    answer_query(@term_ids);
     my $html = cast_template();
     if( $ENV{'REQUEST_METHOD'} ) {
-      print $query->header;
+      print "Content-Type: text/html\n\n";
     }
     print $html;
     log_query();
@@ -146,7 +146,7 @@
       my $http_user_agent = LWP::UserAgent->new;
       my $dummy;
       ($dummy,$dummy,$dummy,$content) = get_url($http_user_agent, $url);
-      if( !$content ) {		# fixme: !defined($content)
+      if( ! $content ) {
         $content = "Error: could not retrieve '".cleanup($url)."'\n";
       }
     } else {
@@ -166,9 +166,12 @@
   my @terms = split(" ", $query_str);
   my $ct = 0;
   foreach my $term (@terms) {
-    # TODO: add some text at top of <body>?
+    # TODO: add some text at top of <body> (Google style)?
     # fixme: umlaut highlighting!
     $term = normalize_special_chars($term);
+    if( is_ignored(remove_accents($term)) ) {
+      next;
+    }
     $content = highlighthtml($term, $content, $HIGHLIGHT_COLORS[$ct]);
     $ct++;
     if( $ct >= scalar(@HIGHLIGHT_COLORS) ) {
@@ -180,7 +183,7 @@
   # Insert our own <base> tag:
   $url = cleanup($url);
   my ($count_repl) = ($content =~ s/<head>/<head>\n<base href="$url">\n/is);
-  if( ! $count_repl ) {		# fixme: defined()
+  if( ! defined($count_repl) ) {
     # maybe the HTML is broken and has no <head>:
     $content = "<base href=\"$url\">\n".$content;
   }
@@ -199,11 +202,12 @@
   my $content = shift;
   my $color = shift;
   my $content_new = "";
-  my @comments = split(/(<!--.*?-->)/igs, $content);
+  my @comments = split(/(<!--.*?-->)/is, $content);
+  my $in_ignore = 0;
   foreach my $c (@comments) {
-    my @tags = split(/(<.*?>)/igs, $c);
+    my @tags = split(/(<.*?>)/is, $c);
     foreach my $part (@tags) {
-      if( $part !~ m/^</ ) {
+      if( $part !~ m/^</ && ! $in_ignore ) {
         $part = normalize_special_chars($part);
         $part =~ s/\b($term)\b/<highlight>$1<\/highlight>/igs;
         # repair possibly damaged entities:
@@ -211,88 +215,239 @@
         # now really highlight:
         $part =~ s/<highlight>($term)<\/highlight>/<span style="color:black;background:$color">$1<\/span>/igs;
       }
+      if( $part =~ /<title/i ) {
+        $in_ignore = 1;
+      } else {
+        $in_ignore = 0;
+      }
       $content_new .= $part;
     }
   }
   return $content_new;
 }
-sub create_query {
-  my $query_str = cleanup($query->param('q'));
-  my $mode = cleanup($query->param('mode'));
-  my @terms = split " ", $query_str;
-  my $buffer;
-  my ($sterm, $nterm);
-  
-  foreach my $term (@terms) {
-    $buffer = normalize($term);
-    if( grep(/^\Q$term\E$/, @stopwords) ) {
-      push(@stopwords_ignored, $term);
-      next;
-    }
-    foreach my $nterm (split " ",$buffer) {
-      $sterm = stem($nterm);
-      # For "Match all words" just add a "+" to every term that has no operator:
-      if ( $mode eq 'all' && $term !~ m/^(\+|\-)/ ) {
-        $term = '+'.$term;
-      }
-      if ($term =~ /^\+/) {
-        if ($terms_db{$sterm}) {
-          push @force, $terms_db{$sterm};
-        } else {
-          return 0;    # this term was not found, we can stop already
-        }
-      } elsif ($term =~ /^\-/) {
-        push @not, $terms_db{$sterm} if $terms_db{$sterm};
-      } else {
-        push @other, $terms_db{$sterm} if $terms_db{$sterm};
+
+sub is_ignored {
+  my $buffer = shift;
+  if( grep(/^\Q$buffer\E$/, @stopwords) || length($buffer) < $MINLENGTH ) {
+    if( ! grep(/^\Q$buffer\E$/, @stopwords_ignored) ) {
+      # don't show words twice:
+	  my $t = normalize($buffer);
+      if( $t ) {
+        push(@stopwords_ignored, $t);
       }
     }
+    return 1;
+  } else {
+    return 0;
   }
-
-  return 1;
 }
 
-sub apply_booleans {
-  #locate the valid documents by applying the booleans
-  my ($term_id, $doc_id, $first_doc_id);
-  my %v = ();
-  my @ary = ();
-  my @not_docs = ();
-
-  my %not_docs = ();
-  map { $not_docs{$_} = 1 } @not_docs;
-
-  foreach $term_id (@not) {
-    %v = unpack("S*", $inv_index_db{$term_id});
-    foreach $doc_id (keys %v) {
-      push @not_docs, $doc_id unless exists $not_docs{$doc_id};
-    }
-  }
-  
-  if (@force) {
-    $first_doc_id = pop @force;
-    %v  = unpack("S*", $inv_index_db{$first_doc_id});
-    @valid_docs = keys %v; 
-    foreach $term_id (@force) {
-      %v = unpack("S*", $inv_index_db{$term_id});
-      @ary = keys %v;
-      @valid_docs = intersection(\@valid_docs, \@ary);
-    }
-    push @force, $first_doc_id;
-  } else {
-    @valid_docs = keys %docs_db;
-  }
+sub create_query {
+	my $query_str = $query->param('q');
+	my $mode = cleanup($query->param('mode'));
+	#my @terms = split(/\s+/, $query_str);
+	my $buffer;
+	my ($sterm, $nterm);
+
+	my %other_docs;
+	my %force_docs;
+	my %not_docs;
+	my @force_docs = ();
+
+	my @term_ids = ();
+
+	###
+	#print "Content-Type: text/html\n\n";
+	
+	my @terms_split = split(/(["\s])/, $query_str);
+	my $inside_quote = 0;
+	my @sub_terms = ();
+	my $i = 0;
+	my $prefix = "";
+	my $can_force = 0;
+	my @terms_split_new = ();
+	foreach my $term (@terms_split) {
+		if( $term eq '"' ) {
+			push(@terms_split_new, $term);
+		} else {
+			foreach my $term2 (split(/\s+/, normalize($term))) {
+				if( !is_ignored(normalize($term2)) ) {
+					push(@terms_split_new, $term2);
+				}
+			}
+		}
+	}
+	foreach my $term (@terms_split_new) {
+    	if( $term eq '"' && ! $inside_quote ) {
+			# quote starts here
+			$inside_quote = 1;
+			#print "#$i: $terms_split[$i-1]<br>";
+			# For "Match all words" just add a "+" to every term that has no operator:
+			if ( $mode eq 'all' && $terms_split[$i-1] !~ m/^(\+|\-)/ ) {
+				$prefix = '+';
+			} elsif ( $terms_split[$i-1] =~ m/^(\+|\-)/ ) {
+				$prefix = $terms_split[$i-1];
+			}
+    	} elsif( $term eq '"' && $inside_quote ) {
+			# phrase search
+			# a quote in the query ends here
+			# Get the doc_ids and the positions that the first sub term (the first term in a search phrase) is in:
+
+			# save term id's - needed for ranking later (in answer_query())
+			foreach my $sub_term (@sub_terms) {
+				my $id = $terms_db{normalize($sub_term)};		# first term of phrase
+				push(@term_ids, $id);
+			}
+
+			my $first_term = shift @sub_terms;
+			my $first_term_id = $terms_db{normalize($first_term)};		# first term of phrase
+			my %first_doc_ids_hash = unpack("S*", $inv_index_db{$first_term_id});
+			my @first_doc_ids = keys(%first_doc_ids_hash);
+			my $first_pos_id = $first_doc_ids[0].'/'.$first_term_id;
+			my $first_terms_positions = $pos_db{$first_pos_id};
+			#print "##\$first_terms_positions=$first_terms_positions ($first_pos_id)<br>";
+			my @found_docs = ();
+
+			### FOR ALL DOCUMENTS where the first sub term occurs:
+			foreach my $doc_id (@first_doc_ids) {
+
+				# find the positions of the first term wrt. to this matched document:
+				my $pos_id = $doc_id.'/'.$first_term_id;
+				my $first_terms_positions = $pos_db{$pos_id};
+				my @first_pos = split(/,/, $first_terms_positions);
+				#print "##~\$first_terms_positions=$first_terms_positions ($pos_id)<br>";
+					
+				### FOR ALL sub terms after the first term in the search phrase:
+				my $j = 1;
+				foreach my $sub_term (@sub_terms) {
+					my $term_id = $terms_db{normalize($sub_term)};
+					my %v = unpack("S*", $inv_index_db{$term_id});
+					my @sub_term_doc_ids = keys(%v);
+
+					my $pos_id = $doc_id.'/'.$term_id;
+					my $terms_positions = $pos_db{$pos_id};
+
+					#print "##~~\$terms_positions=$terms_positions ($pos_id) f. '$sub_term'<br>";
+					if( ! defined($terms_positions) ) {
+						next;
+					}
+					my @term_pos = split(/,/, $terms_positions);
+
+					### FOR ALL POSITIONS 
+					my $found_phrase = 0;
+					foreach my $f_pos (@first_pos) {
+						my $required_pos = $f_pos + $j;
+						#print "\$required_pos=$required_pos in (".join(',', @term_pos).") ??<br>";
+						if( grep(/^$required_pos$/, @term_pos) ) {
+							#print "#### MATCH<br>";
+							$found_phrase = 1;
+							last;
+						}
+					}
+					if( ! $found_phrase ) {
+						#print "####no match<br>";
+						last;
+					}
+					$j++;
+				}
+				#print "#### j ($j) gt ".scalar(@sub_terms)."....<br>";
+				if( $j > scalar(@sub_terms) ) {
+					# all sub terms have been found after the first term -> phrase match
+					push(@found_docs, $doc_id);
+				}
+			}
+
+			if( $prefix eq '+' ) {
+				if( $can_force ) {
+					# don't intersect the first time (would always build an empty set)
+					@force_docs = intersection(\@force_docs, \@found_docs);
+				} else {
+					@force_docs = @found_docs;
+					$can_force = 1;
+				}
+			} else {
+				#print "*******".join(',', @found_docs)."<br>";
+				foreach my $id (@found_docs) {
+					$other_docs{$id} = 1; 		# 1 is a fake value
+				}
+			}
+			$inside_quote = 0;
+			@sub_terms = ();
+			$prefix = "";
+    	} elsif( $inside_quote ) {
+			push(@sub_terms, $term);
+    	} else {
+			my $norm_term = normalize($term);
+			if ( ($mode eq 'all' && $term !~ m/^(\+|\-)/) || $term =~ /^\+/ ) {
+				# term is forced (mode='all')
+				# For "Match all words" just assume a "+" to every term that has no operator:
+				if ($terms_db{$norm_term}) {
+					push(@term_ids, $terms_db{$norm_term});
+					my $term_id = $terms_db{$norm_term};
+					my %v = unpack("S*", $inv_index_db{$term_id});
+					my @v = keys(%v);
+					#print "******$norm_term:".join(',', @v)."<br>";
+					if( $can_force ) {
+						# don't intersect the first time (would always build an empty set)
+						@force_docs = intersection(\@force_docs, \@v);
+					} else {
+						@force_docs = @v;
+						$can_force = 1;
+					}
+				} else {
+					return 0;    # this term was not found, we can stop already
+				}
+			} elsif ( $term =~ /^\-/ ) {
+				# exclude terms
+				if( $terms_db{$norm_term} ) {
+					my $term_id = $terms_db{$norm_term};
+					my %v = unpack("S*", $inv_index_db{$term_id});
+					# set union:
+					foreach my $doc_id (keys(%v)) {
+						$not_docs{$doc_id} = 1;		# 1 is a fake value
+					}
+				}
+			} else {
+				# term is optional (mode='any') or forced (mode='all')
+				if( $terms_db{$norm_term} ) {
+					push(@term_ids, $terms_db{$norm_term});
+					my $term_id = $terms_db{$norm_term};
+					my %v = unpack("S*", $inv_index_db{$term_id});
+					# set union:
+					foreach my $doc_id (keys(%v)) {
+						$other_docs{$doc_id} = 1;		# 1 is a fake value
+					}
+				}
+			}
+    	}
+    	$i++;
+	}
+
+	# hashtable = emulate a set:
+	my @other_docs = keys(%other_docs);
+	my @not_docs = keys(%not_docs);
+
+	if ( $mode ne 'all' ) {
+		@force_docs = (@force_docs, @other_docs);
+	}
+	@valid_docs = minus(\@force_docs, \@not_docs);
+
+	#print "###\@other_docs: <pre>".join(' / ', @other_docs)."</pre><br>";
+	#print "###\@force_docs: <pre>".join(' / ', @force_docs)."</pre><br>";
+	#print "###\@not_docs: <pre>".join(' / ', @not_docs)."</pre><br>";
+	#print "###\@valid_docs: <pre>".join(' / ', @valid_docs)."</pre><br>";
 
-  @valid_docs = minus(\@valid_docs, \@not_docs);
+	return @term_ids;
 }
 
 sub answer_query {
-  my @term_ids = (@force, @other);
+  my @term_ids = @_;
 
   my %valid_docs = ();
   map { $valid_docs{$_} = 1 } @valid_docs;
 
   foreach my $term_id (@term_ids) {
+	#  print "\$term_id=$term_id<br>";
     my %v = unpack('S*', $inv_index_db{$term_id});
     foreach my $doc_id (keys %v) {
       # optionally include only certain files:
@@ -333,6 +488,7 @@
   my $exclude = cleanup($query->param('exclude'));
   my $penalty = cleanup($query->param('penalty'));
   my $mode = cleanup($query->param('mode'));
+  my $sort = cleanup($query->param('sort'));
   my $q = cleanup($query->param('q'));
 
   my $file;
@@ -355,6 +511,7 @@
   $h{'include'} = $include;
   $h{'exclude'} = $exclude;
   $h{'penalty'} = $penalty;
+  $h{'sort'} = $sort;
   if( $mode eq 'all' ) {
     $h{'match_all'} = " selected=\"selected\"";
     $h{'match_any'} = "";
@@ -365,6 +522,7 @@
 
   if( scalar(@stopwords_ignored) > 0 ) {
     my $ignored_terms = join(" ", @stopwords_ignored);
+	#print "### '$ignored_terms':".scalar(@stopwords_ignored)."\n";
     $IGNORED_WORDS{$lang} =~ s/<WORDS>/$ignored_terms/gs;
     $h{'ignored_terms'} = $IGNORED_WORDS{$lang};
   } else {
@@ -399,7 +557,13 @@
     $percent_factor = 100/$max_score if( $max_score );
   }
   
-  foreach ((sort {$answer{$b} <=> $answer{$a}} keys %answer)[$first..$last]) {
+  my @keys;
+  if( defined($query->param('sort')) && $query->param('sort') eq 'title' ) {
+    @keys = sort {uc($titles_db{$a}) cmp uc($titles_db{$b})} (keys %answer);
+  } else {
+    @keys = sort {$answer{$b} <=> $answer{$a}} (keys %answer);
+  }
+  foreach (@keys[$first..$last]) {
     my $score = $answer{$_};
     if( $PERCENTAGE_RANKING ) {
       $score = sprintf("%.f", $score*$percent_factor);
@@ -447,12 +611,15 @@
   $lang = CGI::escape($lang);
   # Note: Keep order of arguments as in search_form.html to get correct visited link recognition:
   # Note that using "&amp;" is correct, "&" isn't. 
-  my $queries = "&amp;lang=$lang";
-  $queries .= "&amp;include=$include";
-  $queries .= "&amp;exclude=$exclude";
-  $queries .= "&amp;penalty=$penalty";
-  $queries .= "&amp;mode=$mode";
+  my $queries = "&amp;lang=".CGI::escape($lang);
+  $queries .= "&amp;include=".CGI::escape($include);
+  $queries .= "&amp;exclude=".CGI::escape($exclude);
+  $queries .= "&amp;penalty=".CGI::escape($penalty);
+  $queries .= "&amp;mode=".CGI::escape($mode);
   $queries .= "&amp;q=".CGI::escape($q);
+  if( defined($query->param('sort')) ) {
+    $queries .= "&amp;sort=".CGI::escape($query->param('sort'));
+  }
   if( $lang eq 'text' ) {
     # avoid warnings for $NEXT_PAGE{$lang}
     $lang = 'en';
@@ -539,6 +706,8 @@
   }
 
   $buffer =~ tr/a-zA-Z0-9_/ /cs;
+  $buffer =~ s/^\s+//;
+  $buffer =~ s/\s+$//;
   return lc $buffer;
 }
 
@@ -546,7 +715,10 @@
 # if $CONTEXT_SIZE is enabled:
 sub get_summary {
   my $id = $_[0];
-  my @terms = split(" ", normalize_special_chars($_[1]));
+  my $terms = $_[1];
+  $terms =~ s/&quot;//igs;
+  #print "ä### $terms #<br>";
+  my @terms = split(" ", normalize_special_chars($terms));
   # +/- operators aren't interesting here:
   foreach my $term (@terms) {
     $term =~ s/^(\+|\-)//;
@@ -555,7 +727,7 @@
   if( $CONTEXT_SIZE ) {
     $desc = get_context($content_db{$id}, @terms);
   }    
-  if( ! $desc ) {	# fixme: defined()
+  if( ! defined($desc) ) {
     $desc = $desc_db{$id};
     foreach my $term (@terms) {
       $desc = term_emphasize($desc, $term);
@@ -570,11 +742,16 @@
   my @terms = @_;
   my @contexts;
   foreach my $term (@terms) {
-    push(@contexts, get_context_for_term($buf, $term));
+    if( ! is_ignored(remove_accents(normalize_special_chars($term))) ) {
+      push(@contexts, get_context_for_term($buf, $term));
+    }
   }
   my $context = "";
   my $ct = 0;
   foreach my $result (@contexts) {
+    foreach my $term (@terms) {
+	  $result =~ s/\b($term)\b/<strong>$1<\/strong>/igs;
+    }
     $context .= "...".$result."...";
     $context .= "<br>" if( $ct < scalar(@contexts)-1 );
     $context .= "\n";
@@ -595,6 +772,7 @@
   my @desc_array = split(" ", $desc);
   my $last_prev = 0;
   foreach my $term_in_desc (@desc_array_normalized) {
+  	#print "****$term_in_desc / $term<br>";
     $ct++;
     $term_in_desc = normalize_special_chars($term_in_desc);
     $term_in_desc = remove_accents($term_in_desc);
@@ -637,7 +815,8 @@
   if( ! defined($str) ) {
     return "";
   }
-  $str =~ s/[<>"'&]//igs;
+  $str =~ s/[<>'&]/ /igs;
+  $str =~ s/"/&quot;/igs;
   return $str;
 }
 
@@ -708,7 +887,7 @@
     $check{$element} = 1;
   }
   foreach my $doc_id (@{$ra}) {
-    push @i, $doc_id if( ! $check{$doc_id} );	# fixme: defined
+    push @i, $doc_id if( ! $check{$doc_id} );
   }
   return @i;
 }
@@ -732,11 +911,13 @@
 # of the path (i.e. parts delimited by "/") on its own.
 sub my_uri_escape {
     my $str = shift;
-    my @parts = split("/", $str);
+    my @parts = split("(/)", $str);
     foreach my $part (@parts) {
-      $part = CGI::escape($part);
+      if( $part ne '/' ) {
+        $part = CGI::escape($part);
+      }
     }
-    $str = join("/", @parts);
+    $str = join("", @parts);
     return $str;
 }
 
@@ -754,4 +935,5 @@
   $zz = $HTTP_START_URL;
   $zz = $DATE_FORMAT;
   $zz = $HIGHLIGHT_TERMS;
+  $zz = $MINLENGTH;
 }
