#!/usr/bin/perl
#
# SWISH::Prog-based Swish3 implementation
#
# Command-line front end with three modes, selected by option:
#   -i : index the files/directories/URLs given in @ARGV
#   -w : search an existing invindex and print the results
#   -M : merge two or more native indexes
# With no mode option the usage statement is printed.
#
use strict;
use warnings;
use Carp;
use SWISH::Prog;
use SWISH::3;
use Getopt::Long qw(:config no_ignore_case);
use Data::Dump qw( dump );
use Rose::DateTime::Util qw(parse_date);
use Time::HiRes;
use Search::Tools::UTF8;

binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';

my $VERSION = '3.0.12';

my $USAGE = qq{$0
 usage:
    swish3 [-E N] [-i dir file ... ] [-S aggregator] [-c file] [-f invindex] [-l] [-v (num)]
    swish3 -w word1 word2 ... [-f file1 file2 ...] \
          [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \
          [-m num] [-t str] [-d delim] [-H (num)] [-x output_format] \
          [-R rank_scheme] [-L prop low high]
    swish3 -k (char|*) [-f invindex1 invindex2 ...]
    swish3 -M invindex1 ... outputfile
    swish3 -N path/to/compare/file or date
    swish3 -V
    swish3 -h

 options: defaults are in brackets
 # commented options are not yet supported

 indexing options:
    -c : configuration file
    -D : Debug mode
    -E : next param is total expected files (generates progress bar)
    -f : invindex dir to create or search from [index.swish3]
    -F : next param is invindex format (ks, lucy, xapian, native, or dbi) [native]
    -i : create an index from the specified files
        for "-S fs" - specify a list of files or directories
        for "-S spider" - specify a list of URLs
    -h : print this usage statement
    -l : follow symbolic links when indexing
    -M : merges index files
    -N : index only files with a modification date newer than path or date
         if no argument, defaults to [indexdir/swish_last_start]
    -S : specify which aggregator to use.
        Valid options are:
         "fs" - local files in your File System
         "spider" - web site files using a web crawler
        #"prog"  - use the program API
        The default value is: "fs"
    #-T : Trace options ('-T help' for info)
    -W : next param is ParserWarnLevel [-W 2]

 search options:
    -b : begin results at this number
    #-d : next param is delimiter.
    #-E : Append errors to file specified, or stderr if file not specified.
    #-e : "Economic Mode": The index proccess uses less RAM.
    -f : invindex dir to create or search from [index.swish3]
    -F : next param is invindex format (ks, lucy, xapian, native, or dbi) [native]
    -h : print this usage statement
    -H : "Result Header Output": verbosity (0 to 9)  [1].
    #-k : Print words starting with a given char.
    -L : Limit results to a range of property values
    -m : the maximum number of results to return [defaults to all results]
    #-P : next param is Phrase delimiter.
    #-p : include these document properties in the output "prop1 prop2 ..."
    -R : next param is Rank Scheme number (0 to 1)  [0].
    -s : sort by these document properties in the output "prop1 prop2 ..."
    #-T : Trace options ('-T help' for info)
    #-t : tags to search in - specify as a string
    #    "HBthec" - in Head|Body|title|header|emphasized|comments
    -V : prints the current version
    -v : indexing verbosity level (0 to 3) [-v 1]
    -w : search for words "word1 word2 ..."
    -x : "Extended Output Format": Specify the output format.

 swish3 version : $VERSION
 SWISH::Prog::VERSION : $SWISH::Prog::VERSION
 docs : http://swish-e.org/swish3/

};

# Option defaults. GetOptions() stores parsed values into this same hash,
# keyed by the primary (first) name of each spec in %allopts.
my $Opt = {
    help               => 0,
    verbose            => undef,
    Version            => 0,
    Debug              => 0,
    config             => '',
    Warnings           => 2,
    Source             => 'fs',
    Format             => 'native',
    query              => '',
    input              => 0,
    folder             => undef,
    begin              => 0,
    max                => undef,
    test_mode          => $ENV{SWISH_TEST} || 0,
    extended_output    => undef,
    Limit              => [],
    newer_than         => undef,
    links              => 0,
    Merge              => undef,
    Headers            => 1,
    sort_order         => '',
    Expected           => undef,
    lucy_highlightable => 0,
};

# Getopt::Long specs mapped to a short description. Only the keys are
# handed to GetOptions(); the descriptions are informational.
my %allopts = (
    'help'                => 'print usage',
    'config=s'            => 'config file',
    'verbose:i'           => 'be verbose',
    'debug|Debug'         => 'debugging',
    'Warnings=i'          => 'print libxml2 warnings',
    'aggregator|Source=s' => 'aggregator type',
    'Format=s'            => 'indexer type (native, ks, lucy, xapian, dbi)',
    'query|words=s'       => 'search query',
    'input'               => 'indexing mode',
    'folder=s'            => 'invindex dir',
    'begin=i'             => 'begin results [0]',
    'max=i'               => 'max results [all]',
    'Version'             => 'print Version',
    'test_mode'           => 'set with SWISH_TEST env var',
    'extended_output|x=s' => 'extended output format string',
    'Limit=s'             => 'Limit to range of values',
    'newer_than|N:s'      => 'index only documents newer than',
    'links'               => 'follow symbolic links when indexing',
    'Merge'               => 'merge 2 or more indexes',
    'Headers=i'           => 'Header output verbosity',
    'RankScheme=i'        => 'ranking algorithm (native only)',
    'sort_order=s'        => 'result sort order',
    'Expected=i'   => 'expected total docs (generates progress bar)',
    'lucy_highlightable' => 'set highlightable_fields for Lucy indexer',
);

# Options that are rejected unless the invindex Format is 'native'.
my %native_only_opts = ( 'RankScheme' => 1, );

GetOptions( $Opt, keys %allopts ) or die $USAGE;

if ( $Opt->{Version} ) {
    print "$0 $VERSION\n";
    exit;
}
if ( $Opt->{help} ) {
    print $USAGE;
    exit;
}

# Refuse native-only options when another Format was requested.
for my $opt ( keys %$Opt ) {
    if ( lc( $Opt->{Format} ) ne 'native'
        and exists $native_only_opts{$opt} )
    {
        ( my $letter = $opt ) =~ s/^(.).+/$1/;
        print "-$letter (--$opt) is a native-only format option\n";
        exit 1;
    }
}

#croak $USAGE unless @ARGV;

if ( $Opt->{input} ) {
    $Opt->{indexer} = $Opt->{Format};
}
$Opt->{invindex} = $Opt->{folder};

$Opt->{debug} and Data::Dump::dump($Opt);

# 'verbose:i' takes an optional value, so a bare -v arrives as 0;
# treat that as level 1.
if ( defined $Opt->{verbose} and $Opt->{verbose} == 0 ) {
    $Opt->{verbose} = 1;
}

# An existing SWISH_WARNINGS environment value wins over -W.
if ( !exists $ENV{SWISH_WARNINGS} ) {
    $ENV{SWISH_WARNINGS} = $Opt->{Warnings};
}

# Constructor arguments for SWISH::Prog: every option whose name is also
# a SWISH::Prog method, plus highlightable_fields.
my %prog_can = ( highlightable_fields => $Opt->{lucy_highlightable}, );
for my $name ( keys %$Opt ) {
    if ( SWISH::Prog->can($name) ) {
        $prog_can{$name} = $Opt->{$name};
    }
}

if ( $Opt->{input} ) {

    # ---- indexing mode ----
    my $prog = SWISH::Prog->new(%prog_can);

    if ( defined $Opt->{newer_than} ) {

        # -N with no argument: compare against the last-start marker file
        if ( !length $Opt->{newer_than} ) {
            $Opt->{newer_than}
                = $prog->indexer->invindex->path->file('swish_last_start');
        }

        # if it's a file, stat it,
        # otherwise convert to timestamp
        my $ts;
        my $dt = parse_date( $Opt->{newer_than} );
        if ( !defined $dt ) {
            my $stat = [ stat( $Opt->{newer_than} ) ];
            if ( !defined $stat->[9] ) {
                croak
                    "-N option must be a valid date string or a readable file: $!";
            }
            $ts = $stat->[9];    # mtime
        }
        else {
            $ts = $dt->epoch;
        }
        $prog->aggregator->set_ok_if_newer_than($ts);
        printf "Skipping documents older than %s\n", scalar localtime($ts);
    }

    if ( $Opt->{links} and $Opt->{Source} eq 'fs' ) {
        $prog->aggregator->config->FollowSymLinks(1);
    }

    if ( $Opt->{Expected} ) {
        $prog->aggregator->progress( progress_bar( $Opt->{Expected} ) );
    }

    # The marker is touched only after any -N timestamp has been read above.
    $prog->indexer->invindex->path->mkpath( $Opt->{debug} );
    $prog->indexer->invindex->path->file('swish_last_start')->touch();

    my $start    = time();
    my $num_docs = $prog->index(@ARGV);
    my $end      = time();
    my $elapsed  = $end - $start;
    printf( "%d documents in %s\n", ( $num_docs || 0 ), secs2hms($elapsed) );
}
elsif ( defined $Opt->{query} and length $Opt->{query} ) {

    # ---- search mode ----
    # Tested with length() rather than truthiness so that a query of "0"
    # is still treated as a search.
    my $invindex = SWISH::Prog::InvIndex->new( path => $Opt->{invindex} );
    my $meta     = $invindex->meta;
    my $format   = $meta->Index->{Format};

    # $format comes from the index header and is interpolated into a
    # string eval below, so insist that it looks like a package name part.
    if ( !defined $format or $format !~ m/\A\w+(?:::\w+)*\z/ ) {
        croak sprintf( "Invalid invindex Format: %s",
            defined $format ? $format : '(undef)' );
    }
    my $sclass = "SWISH::Prog::${format}::Searcher";
    eval "require $sclass";
    croak $@ if $@;

    my $searcher = $sclass->new( invindex => $invindex->path . '', );

    my $start_time = Time::HiRes::time();
    my $results    = $searcher->search(
        to_utf8( $Opt->{query} ),
        {   start => $Opt->{begin},
            max   => $Opt->{max},

            # scalar() keeps the key/value pairs aligned even if
            # parse_limits() returns an empty list
            limit       => scalar parse_limits( $Opt->{Limit} ),
            rank_scheme => $Opt->{RankScheme},
            order       => $Opt->{sort_order},
        }
    );
    my $search_time = Time::HiRes::time() - $start_time;

    if ( $Opt->{Headers} ) {
        printf( "# swish3 version %s\n", $VERSION );
        printf( "# Format: %s\n",         $format );
        printf( "# Query: %s\n",          to_utf8( $Opt->{query} ) );
        printf( "# Hits: %d\n",           $results->hits );
        printf( "# Search time: %.4f\n",  $search_time );
    }
    if ( $Opt->{Headers} > 1 ) {
        printf( "# Parsed Query: %s\n", $results->query );
    }

    # -x: turn "<prop> ... <prop>" into a printf template plus the ordered
    # list of property names that feed it.
    my ( $output_format, $output_format_str );
    if ( $Opt->{extended_output} ) {
        my @props;
        my $default_properties = SWISH::3::SWISH_DOC_PROP_MAP();
        while ( $Opt->{extended_output} =~ m/<(.+?)>/g ) {
            my $p = $1;
            if (    !exists $meta->PropertyNames->{$p}
                and !exists $default_properties->{$p}
                and $p ne 'swishtitle'
                and $p ne 'swishdescription'
                and $p ne 'swishrank' )
            {
                die "Invalid PropertyName: $p\n";
            }
            else {
                push @props, $p;
            }
        }
        $output_format_str = $Opt->{extended_output};

        # The string is handed to printf, so any literal percent sign the
        # user typed must be doubled or it would be read as a conversion.
        $output_format_str =~ s/%/%%/g;

        for my $prop (@props) {

            # \Q..\E: match the property name literally
            $output_format_str =~ s/<\Q$prop\E>/%s/g; # TODO ints and dates
        }

        # make escaped chars work
        $output_format_str =~ s/\\n/\n/g;
        $output_format_str =~ s/\\t/\t/g;
        $output_format_str =~ s/\\r/\r/g;

        $output_format = \@props;

        #warn "str: $output_format_str\n";
        #warn dump $output_format;
    }

    my $counter = 0;
    while ( my $result = $results->next ) {
        if ($output_format) {
            my @res;
            for my $prop (@$output_format) {
                my $val;
                if ( $prop eq 'swishrank' ) {
                    $val = $result->score;
                }
                else {
                    $val = $result->get_property($prop);
                }
                $val = '' unless defined $val;

                # show the \003 control character as a visible escape
                $val =~ s/\003/\\x{03}/g;
                push( @res, to_utf8($val) );
            }
            printf( $output_format_str, @res );
        }
        else {
            printf( qq{%4d %s "%s"\n},
                $result->score, $result->uri, $result->title );
        }

        # stop once -m results have been printed
        if ( $Opt->{max} ) {
            last if ++$counter >= $Opt->{max};
        }
    }
    print ".\n";
}
elsif ( $Opt->{Merge} ) {

    # ---- merge mode ----
    my @indexes = @ARGV;
    my $num     = scalar(@indexes);
    if ( $num < 2 ) {
        die "-M requires minimum of 2 arguments\n";
    }

    # lc() for consistency with the native-only option check above
    if ( lc( $Opt->{Format} ) eq 'native' ) {
        require SWISH::Prog::Native::Indexer;

        # the last argument is the merge target
        my $target_invindex = pop(@indexes);
        my $indexer         = SWISH::Prog::Native::Indexer->new(
            invindex => $target_invindex,
            debug    => $Opt->{debug},
            verbose  => $Opt->{verbose},
        );
        if ( !$indexer->invindex->meta ) {
            die "$target_invindex missing swish.xml header file";
        }
        $indexer->merge(
            map { SWISH::Prog::Native::InvIndex->new( path => $_ ) }
                @indexes );
    }
    else {
        die "InvIndex Format $Opt->{Format} not supported for -M option\n";
    }
}
else {

    # no mode selected (-V and -h were already handled above)
    print $USAGE;
}

exit;

# secs2hms( $seconds )
# Format a number of seconds as "HH:MM:SS". A false/missing argument is
# treated as 0.
sub secs2hms {
    my $secs  = shift || 0;
    my $hours = int( $secs / 3600 );
    my $rm    = $secs % 3600;
    my $min   = int( $rm / 60 );
    my $sec   = $rm % 60;
    return sprintf( "%02d:%02d:%02d", $hours, $min, $sec );
}

# parse_limits( \@limits )
# Split each -L string on whitespace. Returns an array ref of array refs;
# an empty input array ref is returned as-is, and a false argument yields
# a bare return.
sub parse_limits {
    my $limits = shift or return;
    if ( !@$limits ) {
        return $limits;
    }
    my @parsed;
    for my $lim (@$limits) {
        push @parsed, [ split( /\s+/, $lim ) ];
    }
    return \@parsed;
}

# progress_bar( $total )
# Build and return a Term::ProgressBar named 'swish3' that counts up to
# $total. The module is loaded on demand, so it is only needed with -E.
sub progress_bar {
    require Term::ProgressBar;
    my $total = shift;
    my $tpb   = Term::ProgressBar->new(
        {   ETA   => 'linear',
            name  => 'swish3',
            count => $total,
        }
    );
    return $tpb;
}

__END__

=pod

=head1 NAME

swish3 - SWISH::Prog-based Swish3 implementation

=head1 SYNOPSIS

 swish3 [-E N] [-i dir file ... ] [-S aggregator] [-c file] [-f invindex] [-l] [-v (num)]
 swish3 -w word1 word2 ... [-f file1 file2 ...] \
          [-P phrase_delimiter] [-p prop1 ...] [-s sortprop1 [asc|desc] ...] \
          [-m num] [-t str] [-d delim] [-H (num)] [-x output_format] \
          [-R rank_scheme] [-L prop low high]
 swish3 -k (char|*) [-f invindex1 invindex2 ...]
 swish3 -M invindex1 invindex2 ... outputfile
 swish3 -N path/to/compare/file or date
 swish3 -V

NOTE THAT SOME OPTIONS ARE NOT YET SUPPORTED.
Type:

 % swish3 -h

=head1 DESCRIPTION

B<swish3> is one example of implementing the Swish3 API. B<swish3> is written
in Perl using the SWISH::Prog framework.

Type:

 % swish3 -h

for a complete list of options.

=head1 AUTHOR

Peter Karman, E<lt>perl@peknet.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2009-2010 by Peter Karman

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 SEE ALSO

http://swish-e.org/swish3/

=cut