#!/bin/perl5.005_03

# KSearch v1.4
# Copyright (C) 2000 David Kim (kscripts.com)
# Parts of this script are Copyright
# www.perlfect.com (C)2000 G.Zervas. All rights reserved

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA

use Benchmark; # time search
my $t0 = Benchmark->new;
use locale;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
use Fcntl;
{
    # Make modules/config that live next to the script loadable.
    $0 =~ /(.*)\//;
    push @INC, $1 if $1;
}

###### You may have to add the full path to your configuration file below######
###############################################################################
require 'configuration/configuration.pl'; #CONFIGURATION PATH#

my $usehash = 1;     # cleared below when the chosen DBM flavour is SDBM/ODBM/NDBM
my $dbm_package;     # name of the DBM module that could be loaded

# To use the -T switch uncomment the next 2 lines and comment out the
# AnyDBM_File selection block that follows them.
# Note: You must have the DB_File perl module to run taint mode
# and add ./ in front of the CONFIGURATION PATH above.
#use DB_File;
#$dbm_package = 'DB_File';

if ($USE_DBM) {
    # Pick the first DBM implementation that loads, in order of preference.
    package AnyDBM_File;
    @ISA = qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File) unless @ISA;
    for (@ISA) {
        if (eval "require $_") {
            if ($_ =~ /[SON]DBM_File/) {
                $usehash = 0;
            }
            $dbm_package = $_;
            last;
        }
    }
    package main;
}

my %f_file_db;            # file path
my %f_date_db;            # file modification date
my %f_size_db;            # file size
my %f_termcount_db;       # number of non-space characters for score
my %descriptions_db;      # file description
my %filenames_db;         # file names
my %titles_db;            # file title
my %contents_db;          # file contents
my %alt_text_db;          # alt text
my %meta_description_db;  # meta descriptions
my %meta_keyword_db;      # meta keywords
my %meta_author_db;       # meta authors
my %links_db;             # links

if ($USE_DBM) {
    _tie_readonly_db(\%f_file_db,       $F_FILE_DB_FILE);
    _tie_readonly_db(\%f_date_db,       $F_DATE_DB_FILE);
    _tie_readonly_db(\%f_size_db,       $F_SIZE_DB_FILE);
    _tie_readonly_db(\%f_termcount_db,  $F_TERMCOUNT_DB_FILE);
    _tie_readonly_db(\%descriptions_db, $DESCRIPTIONS_DB_FILE);
    _tie_readonly_db(\%titles_db,       $TITLES_DB_FILE);
    _tie_readonly_db(\%filenames_db,    $FILENAMES_DB_FILE);
    # get contents from DBM if no key/value size limits
    _tie_readonly_db(\%contents_db,         $CONTENTS_DB_FILE)         if $usehash;
    _tie_readonly_db(\%alt_text_db,         $ALT_TEXT_DB_FILE)         if $ALT_TEXT;
    _tie_readonly_db(\%meta_description_db, $META_DESCRIPTION_DB_FILE) if $META_DESCRIPTION;
    _tie_readonly_db(\%meta_keyword_db,     $META_KEYWORD_DB_FILE)     if $META_KEYWORD;
    _tie_readonly_db(\%meta_author_db,      $META_AUTHOR_DB_FILE)      if $META_AUTHOR;
    _tie_readonly_db(\%links_db,            $LINKS_DB_FILE)            if $LINKS;
}

my $query = CGI->new;
my $html;                     # returned HTML page
my $query_terms_copy;         # query
my $bare_query_terms;         # original query
my @terms;                    # terms/phrases
my @checked_terms;            # processed terms/phrases
my %stopwords;                # keys are stopterms in query
my $stopwords_regex = ignore_terms();  # stopwords regular expression
my $subsearch;                # true if search within results
my $search_within_results;    # for subsearch loop
my $previous_query;           # previous queries
my $previous_queries;         # previous queries for subsearch to add to links in results page
my @previous_queries;         # previous queries for subsearch for loop
my %previous_results;         # previous results for subsearch loop
my $whole_word;               # true if search for whole words
my $all;                      # true if search includes stop terms
my $case_sensitive;           # true if case sensitive
my $search_body;              # true if search body
my $search_title;             # true if search titles
my $search_meta_description;  # true if search meta descriptions
my $search_meta_keyword;      # true if search meta keywords
my $search_meta_author;       # true if search meta authors
my $search_alt_text;          # true if search alt text
my $search_links;             # true if search links
my $search_url;               # true if search url
my $add_plus;                 # if true, add + to all non +/- terms/phrases
my @plusf;                    # +boolean terms/phrases for search
my @minusf;                   # -boolean terms/phrases for search
my @otherf;                   # other terms/phrases for search
my %plus_files;               # +term => { file => undef } for every file the +term matched
my @none;                     # +boolean terms/phrases without results
my @final_files;              # final files
my %minus;                    # keys are files with -boolean term/phrase
my %clean_body;
my $delimitererror;
my $score;                    # Score header
my $weight_tip;               # note to user about weights
my $totalmatches;             # total match count
my $totalsize;                # total size of all files with matches
my @sortedanswers;            # final list of sorted answers
my %matches;                  # total matches for each file
my %score_numerator;          # characters that match x weights applied
my %score_denominator;        # total characters
my %finalscores;              # final score for each file

# set sorting choice, results per page
{
    my %valid_sort = map { $_ => 1 }
        ('Scores', 'Dates', 'Matches', 'Sizes', 'Titles', 'File Names');
    my $sort_param = $query->param('sort');
    $SORT_BY = $sort_param if defined $sort_param && $valid_sort{$sort_param};

    # Require an all-digit value: a purely numeric comparison would also
    # accept input such as "10<junk>" and store it verbatim.
    my $display_param = $query->param('display');
    $RESULTS_PER_PAGE = $display_param
        if defined $display_param
        && $display_param =~ /^[0-9]+\z/
        && $display_param >= 5
        && $display_param <= 100;
}
# NOTE(review): 'showm' is stored unvalidated and is later interpolated into
# the "&showm=" option string by returnresults -- confirm it is escaped there.
$show_matches = $query->param('showm');

# to search within previous results
if ($SEARCH_RESULTS
    && $query->param('pq') !~ /^\s*$/
    && $query->param($FORM_INPUT_NAME) !~ /^\s*$/
    && $query->param('help') != 1)
{
    $previous_queries = $query->param('pq').' ';
    @previous_queries = split " ", CGI::unescape($query->param('pq'));
}

print $query->header;
start_search();

##Subroutines############

# Tie one DBM file read-only to the hash behind $hash_ref using the DBM
# package selected at start-up; dies naming the file when the tie fails.
sub _tie_readonly_db {
    my ($hash_ref, $db_file) = @_;
    tie %$hash_ref, $dbm_package, $db_file, O_RDONLY, 0755
        or die "Cannot open $db_file: $!";
}

# Run one search pass.  While @previous_queries is non-empty the next
# previous query is searched (a "search within results" pass, $subsearch set);
# otherwise the query from the form is searched.  Resets all per-pass state,
# parses the "opt:" prefixes and quoted phrases, then runs the
# process_terms / search_files / process_booleans / get_sorted_answers chain.
sub start_search {
    my $query_terms;

    # initialize variables
    $score = 'Score:';
    $totalmatches = "";
    $totalsize = "";
    $weight_tip = "";
    @checked_terms = ();
    @plusf = ();
    @minusf = ();
    @otherf = ();
    %plus_files = ();
    @none = ();
    @final_files = ();
    %stopwords = ();
    %minus = ();
    %matches = ();
    %score_numerator = ();
    %score_denominator = ();
    %finalscores = ();
    $query_terms_copy = "";
    $add_plus = "";
    $all = "";
    $whole_word = "";
    $case_sensitive = "";
    $search_title = "";
    $search_meta_description = "";
    $search_meta_keyword = "";
    $search_meta_author = "";
    $search_alt_text = "";
    $search_body = "";
    $search_links = "";
    $search_url = "";

    #if (@previous_queries && scalar@previous_queries < 7) { # to prevent looping too much
    if (@previous_queries) { # search results of previous queries
        $query_terms = shift @previous_queries;
        $subsearch = 1;
    }
    else { # search current query
        $query_terms = $query->param($FORM_INPUT_NAME);
        # NOTE(review): both alternatives are a plain space as written, which
        # makes this a no-op; they look like HTML entities lost in transit
        # (presumably the non-breaking-space forms) -- confirm against upstream.
        $query_terms =~ s/( )|( )/ /gs; # remove spaces
        $query_terms = translate_characters($query_terms); # ISO Latin approximations
        $bare_query_terms = $query_terms; # original query

        # Fold the form's check boxes into the same "opt:" prefixes a user
        # can type directly, so both routes share the parser below.
        $query_terms = 'all:'.$query_terms if $query->param('all') == 1;
        $query_terms = 'c:'.$query_terms if ($query->param('c') eq "s" && $CASE_SENSITIVE);
        $query_terms = 'w:'.$query_terms if $query->param('w') == 1;
        $query_terms = 'st:'.$query_terms if ($query->param('st') == 1 && $ALL);
        unless ($query->param('default') == 1) { # search content options
            $query_terms = 'b:'.$query_terms if $query->param('b') == 1;
            $query_terms = 't:'.$query_terms if ($query->param('t') == 1);
            $query_terms = 'd:'.$query_terms if ($query->param('d') == 1 && $META_DESCRIPTION);
            $query_terms = 'k:'.$query_terms if ($query->param('k') == 1 && $META_KEYWORD);
            $query_terms = 'au:'.$query_terms if ($query->param('au') == 1 && $META_AUTHOR);
            $query_terms = 'alt:'.$query_terms if ($query->param('alt') == 1 && $ALT_TEXT);
            $query_terms = 'l:'.$query_terms if ($query->param('l') == 1 && $LINKS);
            $query_terms = 'u:'.$query_terms if ($query->param('u') == 1 && $URL);
        }
        $query_terms =~ s/^\s+//;
        $query_terms =~ s/\s+$//;
        $query_terms =~ s/\s+/ /g;
        $previous_query = $query_terms; # query with options for previous query option
        $subsearch = "";
    }

    # let user add options directly in query text field
    while ($query_terms =~ s/^(c|[0-9]+|score|date|match|size|title|name|b|t|d|k|au|alt|st|w|l|u|all)\://io) {
        my $option = lc $1;
        $query_terms =~ s/^\s+//;

        if    ($option eq 'c')     { $case_sensitive = 1 if $CASE_SENSITIVE; }
        elsif ($option eq 'score') { $SORT_BY = "Scores"; }
        elsif ($option eq 'date')  { $SORT_BY = "Dates"; }
        elsif ($option eq 'match') { $SORT_BY = "Matches"; }
        elsif ($option eq 'size')  { $SORT_BY = "Sizes"; }
        elsif ($option eq 'title') { $SORT_BY = "Titles"; }
        elsif ($option eq 'name')  { $SORT_BY = "File Names"; }
        elsif ($option eq 'b')     { $search_body = 1; }
        elsif ($option eq 't')     { $search_title = 1; }
        elsif ($option eq 'd')     { $search_meta_description = 1 if $META_DESCRIPTION; }
        elsif ($option eq 'k')     { $search_meta_keyword = 1 if $META_KEYWORD; }
        elsif ($option eq 'au')    { $search_meta_author = 1 if $META_AUTHOR; }
        elsif ($option eq 'alt')   { $search_alt_text = 1 if $ALT_TEXT; }
        elsif ($option eq 'u')     { $search_url = 1 if $URL; }
        elsif ($option eq 'l')     { $search_links = 1 if $LINKS; }
        elsif ($option eq 'st')    { $all = 1 if $ALL; }
        elsif ($option eq 'w')     { $whole_word = 1; }
        elsif ($option eq 'all')   { $add_plus = 1; }
        elsif ($option =~ /^[0-9]+$/) {
            # "N:" sets results per page, clamped to 5..100
            if    ($option < 5)   { $RESULTS_PER_PAGE = 5; }
            elsif ($option > 100) { $RESULTS_PER_PAGE = 100; }
            else                  { $RESULTS_PER_PAGE = $option; }
        }
    }

    # return page if no query or for help
    if ($query->param('help') == 1 || $query_terms =~ /^\s*$/) {
        returnresults();
        # NOTE(review): returnresults presumably ends the request; returning
        # here keeps a second results page from being built if it does not.
        return;
    }

    unless ($search_title || $search_meta_description || $search_meta_keyword
        || $search_meta_author || $search_alt_text || $search_body
        || $search_links || $search_url)
    {
        # search body, title, and meta description as default
        $show_matches = $SHOW_MATCHES;
        $search_body = 1;
        $search_title = 1;
        $search_meta_description = 1;
    }

    my @phrases;
    if ($DO_PHRASES) { # get phrases
        # Most specific form first: +<N>"...", then <N>"...", then [+]"..."
        foreach my $quoted (qr/(\+<[0-9]+>)\"([^\"]*)\"/,
                            qr/(<[0-9]+>)\"([^\"]*)\"/,
                            qr/(\+?)\"([^\"]*)\"/)
        {
            while ($query_terms =~ s/$quoted/ /) {
                my $phrase = get_phrase($1, $2);
                push @phrases, $phrase if defined $phrase;
            }
        }
    }

    $query_terms =~ s/^\s+//;
    $query_terms =~ s/\s+$//;
    @terms = split /\s+/, $query_terms; # get terms
    push @terms, @phrases if $DO_PHRASES; # append phrases to terms array

    process_terms();
    search_files() if (@otherf || @plusf || @minusf);
    process_booleans();
    get_sorted_answers();
}

# Trim a quoted phrase and glue its boolean/weight prefix back on.
#   $boolean - text captured in front of the quotes ("", "+", "<N>" or "+<N>")
#   $phrase  - text captured between the quotes
# Returns the prefixed phrase, or nothing when the phrase is empty after
# trimming.  The test is on length so that the phrase "0" is not discarded.
sub get_phrase {
    my ($boolean, $phrase) = @_;
    $phrase =~ s/^\s+//;
    $phrase =~ s/\s+$//;
    return unless length $phrase;
    return $boolean.$phrase;
}

# Classify every entry of @terms into @plusf / @minusf / @otherf, skipping
# repeats and stop terms, and rebuild the normalised query in
# $query_terms_copy.  Entries of @terms lose their leading +/- in place.
sub process_terms { # get terms and phrases and start search routine
    my %terms;    # terms already seen (weight removed, lower-cased unless case sensitive)

    foreach my $term (@terms) {
        my $cp = $term;
        my $cp_c;
        $cp =~ s/^\+// if $cp ne '+'; # remove + boolean

        my $has_weight = 0;
        if ($cp !~ /^<[0-9]+>$/ && $cp =~ m/^<([0-9]+)>/) {
            my $user_weight = $1;
            $has_weight = 1;
            if ($user_weight >= 2 && $user_weight <= 10000 && $USER_WEIGHTS) {
                $cp =~ s/^<[0-9]+>//; # remove user defined weights
            }
            elsif ($cp =~ / / && $USER_WEIGHTS) {
                # out-of-range weight on a phrase: drop it and tell the user
                $weight_tip = "\nNote: Scoring weights must be in the range of <2-10000>";
                $cp =~ s/^<[0-9]+>//; # remove user defined weights
            }
        }

        $cp_c = $cp;
        $cp = lc $cp if !$case_sensitive;
        next if exists $terms{$cp}; # skip repeats
        $terms{$cp} = undef;
        $cp =~ s/^\-// if !$has_weight && $cp ne '-'; # remove - boolean

        unless ($all || $cp =~ /^\S+\*$/) { # ignore stop terms
            if (length($cp) < $MIN_TERM_LENGTH
                || $cp =~ m/^$stopwords_regex$/io
                || $cp =~ m/^(<|>)$/)
            {
                $query_terms_copy .= "$cp_c ";
                $cp_c =~ s/^\-// if $cp_c ne '-'; # remove - boolean
                $stopwords{$cp_c} = undef;
                next;
            }
        }

        my $kind;
        if    ($term ne '+' && $term =~ s/^\+//) { $kind = 'plus'; }
        elsif ($term ne '-' && $term =~ s/^\-//) { $kind = 'minus'; }
        else                                     { $kind = $add_plus ? 'plus' : 'other'; }

        if ($kind eq 'plus') {
            # Start with an empty hit set in a lexical hash.  The term text
            # comes from the user, so it must never be used as a symbolic
            # variable name.
            $plus_files{$term} = {};
            push @plusf, $term;
            push @checked_terms, $cp_c;
            $query_terms_copy .= ($cp_c =~ / / ? "+\"$cp_c\" " : "+$cp_c ");
        }
        elsif ($kind eq 'minus') {
            push @minusf, $term;
            $query_terms_copy .= ($term =~ / / ? "-\"$term\" " : "-$term ");
        }
        else {
            push @otherf, $term;
            push @checked_terms, $cp_c;
            $query_terms_copy .= ($cp_c =~ / / ? "\"$cp_c\" " : "$cp_c ");
        }
    }
}

# Call search_contents once per indexed file.  With $USE_DBM the files come
# from the tied %f_file_db; otherwise the tab-separated flat file
# $DATABASEFILE is read and the *_db hashes are filled record by record,
# keyed by record number.
sub search_files {
    if ($USE_DBM) {
        while (my ($file, $file_path) = each %f_file_db) {
            search_contents($file, $file_path);
        }
    }
    else {
        my $file_count = 0;
        open (FILEDB, $DATABASEFILE) || die "Can't open database file.\n";
        while (defined(my $record = <FILEDB>)) {
            chomp $record;    # keep the line terminator out of the last field
            $file_count++;
            ($f_file_db{$file_count},
             $filenames_db{$file_count},
             $f_date_db{$file_count},
             $f_size_db{$file_count},
             $f_termcount_db{$file_count},
             $descriptions_db{$file_count},
             $titles_db{$file_count},
             $contents_db{$file_count},
             $alt_text_db{$file_count},
             $meta_description_db{$file_count},
             $meta_keyword_db{$file_count},
             $meta_author_db{$file_count},
             $links_db{$file_count}) = split /\t/, $record;
            search_contents($file_count, $f_file_db{$file_count});
        }
        close(FILEDB);
    }
}

# Split an optional leading "<N>" user weight off a term.
# Returns ($bare_term, $weight): $weight is N when 2 <= N <= 10000 and
# $USER_WEIGHTS is on, otherwise undef.  An out-of-range weight is still
# stripped from a term containing a space (a phrase); any other term is
# returned unchanged.
sub _split_term_weight {
    my ($term) = @_;
    my $weight;
    if ($term !~ /^<[0-9]+>$/ && $term =~ m/^<([0-9]+)>/) {
        my $requested = $1;
        if ($requested >= 2 && $requested <= 10000 && $USER_WEIGHTS) {
            $term =~ s/^<[0-9]+>//; # remove user defined weights
            $weight = $requested;
        }
        elsif ($term =~ / / && $USER_WEIGHTS) {
            $term =~ s/^<[0-9]+>//; # remove user defined weights
        }
    }
    return ($term, $weight);
}

# Look for $term in every [text, field weight] pair of @$fields and add the
# hits to %matches and %score_numerator for $file.  A valid user weight
# switches the result ordering to weighted scores.  Returns true when the
# term matched in at least one field.
sub _score_term {
    my ($file, $term, $fields) = @_;

    my ($bare_term, $weight) = _split_term_weight($term);
    if (defined $weight) {
        $SORT_BY = "Scores"; # Sort by scores if using weights
        $score = 'Weighted Score:';
    }
    else {
        $weight = 1;
    }

    # score by the number of non-space characters (matters for phrases)
    (my $compact = $bare_term) =~ s/\s+//g;
    my $term_length = length $compact;

    my $found = 0;
    foreach my $field (@$fields) {
        my ($text, $field_weight) = @$field;
        my $count = find_matches($text, $bare_term);
        next unless $count;
        $matches{$file} += $count;
        $score_numerator{$file} += $count * $term_length * $field_weight * $weight;
        $found = 1;
    }
    return $found;
}

# Search one indexed file.
#   $_[0] - key of the file in the *_db hashes
#   $_[1] - path of the file relative to $INDEXER_START
# Adds the searched character counts to %score_denominator, records +term
# hits in %plus_files, and marks the file in %minus (skipping the plain
# terms) as soon as any -term matches.
sub search_contents {
    my ($file, $file_path) = @_;
    my $body;

    if ($search_body) {
        $score_denominator{$file} += $f_termcount_db{$file}; # add character count of body
        if ($SAVE_CONTENT) { # search pre-processed files in database (faster but uses disk space)
            if ($usehash) { # get contents from DBM if no size limits
                $body = $contents_db{$file};
            }
            else { # otherwise get contents from separate files
                open (FILE,$DATABASE_DIR.$file) || die "Cannot open $DATABASE_DIR$file: $!";
                $body = <FILE>;    # NOTE(review): reads one record; presumably the stored content is a single line
                close (FILE);
            }
        }
        else { # search html file directly (slower but saves disk space)
            open (FILE,$INDEXER_START.$file_path) || die "Cannot open $INDEXER_START$file_path: $!";
            my @LINES = <FILE>;
            close (FILE);
            $body = join ' ', @LINES;
            # must clean contents and search larger file (slow part)
            $body =~ s/(<script[^>]*>.*?<\/script>)|(<style[^>]*>.*?<\/style>)/ /gsi;
            # NOTE(review): the two empty alternatives make this a no-op as
            # written; the missing tag names could not be recovered -- confirm
            # against upstream.
            $body =~ s/||<\/code>//gsi;
            # NOTE(review): the two space alternatives presumably were HTML
            # entities -- confirm against upstream.
            $body =~ s/(<[^>]*>)|( )|( )/ /gs; # remove html poorly
            $body = translate_characters($body); # ISO Latin approximations
            $body =~ s/\s+/ /gs;
            $clean_body{$file} = $body if $show_matches;
        }
    }

    # Every enabled source as [ text, field weight ], in search order.
    my @fields;
    push @fields, [ $titles_db{$file},           $TITLE_WEIGHT ]            if $search_title;
    push @fields, [ $meta_description_db{$file}, $META_DESCRIPTION_WEIGHT ] if $search_meta_description;
    push @fields, [ $meta_keyword_db{$file},     $META_KEYWORD_WEIGHT ]     if $search_meta_keyword;
    push @fields, [ $meta_author_db{$file},      1 ]                        if $search_meta_author;
    push @fields, [ $alt_text_db{$file},         1 ]                        if $search_alt_text;
    push @fields, [ $links_db{$file},            1 ]                        if $search_links;
    push @fields, [ $BASE_URL.$f_file_db{$file}, 1 ]                        if $search_url;

    # add character counts for score (the body count was added above)
    foreach my $field (@fields) {
        (my $compact = $field->[0]) =~ s/\s+//gs;
        $score_denominator{$file} += length $compact;
    }
    unshift @fields, [ $body, 1 ] if $search_body;

    foreach my $term (@plusf) { # find +boolean terms/phrases
        $plus_files{$term}{$file} = undef if _score_term($file, $term, \@fields);
    }

    foreach my $term (@minusf) { # skip files with -boolean terms/phrases
        foreach my $field (@fields) {
            if (find_matches($field->[0], $term, 'minus')) {
                $minus{$file} = undef;
                return;
            }
        }
    }

    foreach my $term (@otherf) { # find other terms/phrases
        _score_term($file, $term, \@fields);
    }
}

# Resolve the +terms.  Every +term without a single matching file is added
# to @none.  When all +terms matched, @final_files becomes the intersection
# of their file sets.  The term with the smallest set is removed from @plusf
# here; get_sorted_answers tests @plusf afterwards, so that removal is part
# of the contract.
sub process_booleans {
    my $noplus;
    foreach my $term (@plusf) { # first check if matches exist for each + term
        next if %{ $plus_files{$term} || {} };
        # if there no files with +boolean term
        $noplus = 1;
        my ($bare_term) = _split_term_weight($term);
        push @none, ($bare_term =~ / / ? '"'.$bare_term.'"' : $bare_term);
    }
    return if $noplus;
    return unless @plusf;

    # if all + terms have matches find intersection; find smallest hash first
    my $smallest = 0;
    my $smallest_size = scalar keys %{ $plus_files{ $plusf[0] } };
    for my $j (1 .. $#plusf) {
        my $size = scalar keys %{ $plus_files{ $plusf[$j] } };
        ($smallest, $smallest_size) = ($j, $size) if $size < $smallest_size;
    }
    my $pivot = splice @plusf, $smallest, 1;

    # Check each file of the smallest set against the remaining ones
    my %intersection;
    CANDIDATE:
    foreach my $candidate (keys %{ $plus_files{$pivot} }) {
        foreach my $other (@plusf) {
            next CANDIDATE unless exists $plus_files{$other}{$candidate};
        }
        $intersection{$candidate} = undef;
    }
    @final_files = keys %intersection;
}

# Turn the matched files into scored answers.  During a "search within
# results" pass the scores are saved as %previous_results and the next query
# is started; on the final pass the answers are sorted by $SORT_BY and
# returnresults is called.
sub get_sorted_answers {
    if (@none || (!@final_files && (@plusf))) { # if there are no results
        returnresults();
        # NOTE(review): returnresults presumably ends the request; returning
        # here keeps a second results page from being built if it does not.
        return;
    }

    @final_files = keys %matches if !@final_files; # get files with matches

    foreach my $answer (@final_files) { # get final answers
        next if exists $minus{$answer}; # remove files with -boolean terms/phrases
        # search within previous results if option is chosen
        next if $search_within_results && !exists $previous_results{$answer};

        if ($score_denominator{$answer} != 0) {
            $finalscores{$answer} = sprintf("%.2f", 100*($score_numerator{$answer}/$score_denominator{$answer}));
        }
        else {
            $finalscores{$answer} = "n/a";
        }
        $totalmatches += $matches{$answer};
        $totalsize += $f_size_db{$answer};
    }

    if ($subsearch) { # loop through results of each previous query to search within results
        $search_within_results = 1;
        %previous_results = %finalscores;
        start_search();
        return;
    }

    # sort answers: numeric keys descending, text keys ascending, the
    # remaining keys acting as tie breakers in the listed order
    my %compare = (
        matches => sub { $matches{$b}     <=> $matches{$a} },
        score   => sub { $finalscores{$b} <=> $finalscores{$a} },
        date    => sub { $f_date_db{$b}   <=> $f_date_db{$a} },
        size    => sub { $f_size_db{$b}   <=> $f_size_db{$a} },
        title   => sub { lc($titles_db{$a})    cmp lc($titles_db{$b}) },
        name    => sub { lc($filenames_db{$a}) cmp lc($filenames_db{$b}) },
    );
    my %key_order = (
        'Matches' => [qw(matches score date size title name)],
        'Scores'  => [qw(score matches date size title name)],
        'Dates'   => [qw(date matches score size title name)],
        'Sizes'   => [qw(size matches score date title name)],
        'Titles'  => [qw(title name matches score date size)],
    );
    # anything else (including "File Names") sorts by file name first
    my $keys = $key_order{$SORT_BY} || [qw(name title matches score date size)];
    my @comparators = map { $compare{$_} } @$keys;

    @sortedanswers = sort {
        my $order = 0;
        foreach my $comparator (@comparators) {
            $order = $comparator->();
            last if $order;
        }
        $order;
    } keys %finalscores;

    returnresults();
}

sub returnresults { # creates HTML page from template file
    my %h;
    my ($options, $sortby, $casesearch, $commonterms, $subsearch_string, $subsearch_info);
    my $rank = 0;
    $query_terms_copy =~ s/\s$//;
    my $bare_query = $query_terms_copy;
    my $query_str = CGI::escape($bare_query_terms);
    my $previous_query_str = CGI::escape($previous_query);
    $bare_query_terms =~ s/\"/\"\;/g;
    $h{query_str} = $bare_query_terms;
    $h{version} = $VERSION;
    $h{search_url} = $SEARCH_URL;
    $h{input_name} = $FORM_INPUT_NAME;
    $html = get_template($KSEARCH_TEMPLATE);
    my $results = @sortedanswers;
    my $currentpage = $query->param('p');
    $currentpage ||= 1;
    if ($SEARCH_RESULTS && $query->param('pq') !~ /^\s*$/) {
        $subsearch_string = '&pq='.CGI::escape($query->param('pq'));
        $subsearch_info = ' from previous results';
    }
    my ($search_sources, $search_options);
    #### Search
form options format #### if ($add_plus) { $options .= '&all=1'; $h{add_plus} = '+'; } else { $h{add_plus} = '+'; } $h{default} = 'Default'; if ($case_sensitive) { $options .= '&c=s'; $casesearch = 'case sensitive'; $h{c} = 'Case Sensitive'; } else { $casesearch = ""; $h{c} = 'Case Sensitive' if $CASE_SENSITIVE; } if ($show_matches && $SHOW_MATCHES) { $options .= "\&showm=$show_matches"; $h{show_matches} = 'Show Matches in Descriptions'; } elsif ($SHOW_MATCHES) { $h{show_matches} = 'Show Matches in Descriptions'; } else { $h{show_matches} = ""; } if ($whole_word) { $options .= '&w=1'; $h{w} = 'Whole Words Only'; } else { $h{w} = 'Whole Words Only'; } if ($all) { $options .= '&st=1'; $h{st} = 'Include Stop-Terms'; } else { $h{st} = 'Include Stop-Terms' if $ALL; } if ($search_body) { $search_sources .= " Body,"; $options .= '&b=1'; $h{b} = 'Body'; } else { $h{b} = 'Body'; } if ($search_title) { $search_sources .= " Title,"; $options .= '&t=1'; $h{t} = 'Title'; } else { $h{t} = 'Title'; } if ($search_meta_description) { $search_sources .= " Meta-Description,"; $options .= '&d=1'; $h{d} = 'Meta-Description'; } else { $h{d} = 'Meta-Description' if $META_DESCRIPTION; } if ($search_meta_keyword) { $search_sources .= " Meta-Keywords,"; $options .= '&k=1'; $h{k} = 'Meta-Keywords'; } else { $h{k} = 'Meta-Keywords' if $META_KEYWORD; } if ($search_meta_author) { $search_sources .= " Meta-Authors,"; $options .= '&au=1'; $h{au} = 'Meta-Authors'; } else { $h{au} = 'Meta-Authors' if $META_AUTHOR; } if ($search_alt_text) { $search_sources .= " Alt-Text,"; $options .= '&alt=1'; $h{alt} = 'Alt-Text'; } else { $h{alt} = 'Alt-Text' if $ALT_TEXT; } if ($search_links) { $options .= '&l=1'; $search_sources .= " Links,"; $h{l} = 'Links'; } else { $h{l} = 'Links' if $LINKS; } if ($search_url) { $search_sources .= " Url,"; $options .= '&u=1'; $h{u} = 'Url'; } else { $h{u} = 'Url' if $URL; } $search_sources =~ s/,$//; if ($SORT_BY eq "Matches") { $h{sort} = '