--- indexer.pl	Thu Sep 14 20:06:18 2000
+++ indexer1.pl	Mon Oct  9 11:58:29 2000
@@ -42,7 +42,7 @@
 }
 
 package main;
-require 'conf.pl';
+require 'conf1.pl';		# MarkMLl
 require 'SGMLEntities.pl';
 
 $|=1;
@@ -75,6 +75,7 @@
 print "Loading stopwords...";
 my $stopwords_regex = load_stopwords();
 print "Done.\n";
+my $extensions_regex = join '|', @EXT;		# MarkMLl
 
 print "Starting crawler...\n";
 crawl($DOCUMENT_ROOT.$INDEXER_START_DIR);
@@ -129,7 +130,12 @@
   closedir(DIR);
 
   my @dirs  = grep {-d and not /^\.{1,2}$/} @contents; 
-  my @files = grep {-f and /^.+\.(.+)$/ and grep {/^\Q$1\E$/} @EXT} @contents;
+#  my @files = grep {-f and /^.+\.(.+)$/ and grep {/^\Q$1\E$/} @EXT} @contents;
+  my @files = grep {-f and /^(?!\.)([^.]*(?<!~)|.*\.(?:$extensions_regex))$/o} @contents; 
+
+# The change above accepts files with no extension, and excludes files whose
+# names start with . or (where there is no extension) end with ~. MarkMLl,
+# courtesy of Hugo van der Sanden (hv@crypt0.demon.co.uk).
   
   FILE: foreach my $f (@files) {
     $file = $dir."/".$f;
@@ -231,7 +237,10 @@
   my $buffer = $_[0];
 
   $buffer =~ s/<!--.*?-->//gis;  # strip html comments
-  $buffer =~ s/-(\s*\n\s*)?//g;  # join parts of hyphenated words
+#  $buffer =~ s/-(\s*\n\s*)?//g;  # join parts of hyphenated words
+
+# The line above discards all hyphens. However, if we want to be able
+# to index telephone numbers it's worth deferring this. MarkMLl.
 
   if( $SPECIAL_CHARACTERS ) {
     # There may be special characters that are not encoded, so encode them:
@@ -248,10 +257,27 @@
     $buffer =~ s/&(..?)(grave|acute|circ|tilde|uml|ring|cedil|slash|lig);/$1/igs;
   }
 
+# Block below modified so that, when indexing numbers (particularly telephone
+# numbers), a hyphen becomes a space if it has three or more digits on one side
+# and one or more on the other; otherwise it is discarded. Pretty horrid to be
+# honest: getting the regex to apply correctly to numbers with several hyphens,
+# e.g. 1-59592-149-6, turned out to be tricky, and I've moved the HTML tag
+# eliminator to the start of the block to get rid of spurious <br>s.
+
+# Note to permanent maintainers: feel free to remove my moniker, it's just
+# here so I can find changes quickly.
+
   if ($INDEX_NUMBERS) {
-    $buffer =~ s/(<[^>]*>)/ /gs;
+    $buffer =~ s/(<[^>]*>)/ /gs;			# MarkMLl
+    my $enDash = ($buffer =~ /\d-\d/);			# MarkMLl
+    while ($enDash) {					# MarkMLl
+      $enDash = ($buffer =~ s/(\d{1,})-(?:\s*\n\s*)?(\d{3,})/$1 $2/gs); # MarkMLl
+      $enDash += ($buffer =~ s/(\d{3,})-(?:\s*\n\s*)?(\d{1,})/$1 $2/gs) # MarkMLl
+    }							# MarkMLl
+    $buffer =~ s/-(\s*\n\s*)?//gs; 			# MarkMLl
   } else {
-    $buffer =~ s/(\b\d+\b)|(<[^>]*>)/ /gs;
+    $buffer =~ s/(\b\d+\b)|(<[^>]*>)/ /gs;		# MarkMLl
+    $buffer =~ s/-(\s*\n\s*)?//gs;  # join parts of hyphenated words MarkMLl
   }
 
   $buffer =~ s/$stopwords_regex//gio;
--- conf.pl	Thu Sep 14 18:00:21 2000
+++ conf1.pl	Sun Sep 24 18:39:45 2000
@@ -1,5 +1,7 @@
 # Perlfect Search Configuration file
 
+# Modified to allow multiple scripts (search.pl, search1.pl) and directories. MarkMLl
+
 # NOTE: Whenever you change one of the options that's marked with [re-index] you
 # need to run indexer.pl again to make the change take effect.
 
@@ -16,13 +18,14 @@
 $INSTALL_DIR = '/var/lib/apache/share/cgi-bin/perlfect/search/';
 
 # Only files with these extensions should be indexed. [re-index]
-@EXT = ("htm","html","shtml","txt");
+# Entries may now be regexes, and blank entries are handled correctly. MarkMLl.
+@EXT = ("html?","shtml","txt","");
 
 # How many results should be shown per page.
 $RESULTS_PER_PAGE = 10;
 
 # Do you want to index numbers? If so set $INDEX_NUMBERS to 1. [re-index]
-$INDEX_NUMBERS = 0;
+$INDEX_NUMBERS = 1;
 
 # How many words should be used from the <BODY> of an html document as a description
 # for the document in case there is no <META description> tag available. [re-index]
@@ -72,7 +75,7 @@
 $PREV_PAGE{'de'} = 'vorige Seite';
 
 # You shouldn't have to edit anything below this line.
-$DATA_DIR = $INSTALL_DIR.'data/';
+$DATA_DIR = $INSTALL_DIR.'data1/';		# MarkMLl
 $INV_INDEX_DB_FILE = $DATA_DIR.'inv_index';
 $DOCS_DB_FILE      = $DATA_DIR.'docs';
 $TERMS_DB_FILE     = $DATA_DIR.'terms';
@@ -87,11 +90,11 @@
 $DESC_TMP_DB_FILE      = $DATA_DIR.'desc_tmp';
 $TITLES_TMP_DB_FILE    = $DATA_DIR.'titles_tmp';
 
-$CONF_DIR = $INSTALL_DIR."conf/";
+$CONF_DIR = $INSTALL_DIR."conf1/";		# MarkMLl
 $STOPWORDS_FILE = $CONF_DIR.'stopwords.txt';
 $NO_INDEX_FILE = $CONF_DIR.'no_index.txt';
 
-$SEARCH = 'search.pl';
+$SEARCH = 'search1.pl';				# MarkMLl
 $SEARCH_URL = $CGIBIN.$SEARCH;
 
 $VERSION = "3.09";
