#!@@perlbinary@@ -w use strict; # This is set to where Swish-e's "make install" installed the helper modules. use lib ( '@@perlmoduledir@@' ); ################################################################################### # # Copyright (C) 2001 Bill Moseley swishscript@hank.org # Program to test the SWISH::Filter module # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version # 2 of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # The above lines must remain at the top of this program # # $Id$ # #################################################################################### use Getopt::Long; use SWISH::Filter; use Pod::Usage; use URI; use constant ABORT => 0; use constant DEBUG => 1; use constant INFO => 2; my ( $verbose, $show_content, @file, @url, $help, $man, $quiet, $headers, $path, $depreciated, $mimetypes); my $skip_binary = 1; my $lines = 10; my $max_chars = 1000; GetOptions( 'verbose!' => \$verbose, # turn on INFO messages 'content!' => \$show_content, 'quiet' => \$quiet, 'lines=i' => \$lines, 'help|?' => \$help, 'man' => \$man, 'headers' => \$headers, 'skip_binary!' => \$skip_binary, 'path' => \$path, 'depreciated'=> \$depreciated, 'mimetypes' => \$mimetypes, ) || pod2usage(2); pod2usage( -verbose => 1 ) if $help; pod2usage( -verbose => 2 ) if $man; if ( $path ) { print '@@perlmoduledir@@',"\n"; exit; } pod2usage( -verbose => 0, -message => "Must specify a file or URL", -exitvar => 1, ) unless @ARGV or $mimetypes; $ENV{FILTER_DEBUG} = 1 if $verbose; # used by SWISH::Filter msg(INFO, "SWISH::Filter found at [%s]\n", $INC{'SWISH/Filter.pm'} ); my $filter = SWISH::Filter->new; if ( $mimetypes ) { my @filters = $filter->filter_list; print "Mimetypes:\n\n"; for my $filter ( @filters ) { print " $filter:\n"; for my $pattern ( $filter->mimetypes ) { print" $pattern\n"; } print "\n"; } } my $return = 0; for my $doc ( @ARGV ) { eval { $depreciated ? process_doc_old( $doc ) : process_doc( $doc ) }; $return = 1 if $@; warn "** $0:\n $@\n" if $@; # always warn on die } exit $return; sub process_doc { my ($file) = @_; my $uri; eval { $uri = URI->new( $file ) }; my %config = !$@ && $uri->scheme ? fetch_url( $file ) : fetch_file( $file); my $doc = $filter->convert( %config, name => $file, ); die "Failed to process document [$file]\n" unless $doc; my $content_type = $doc->content_type || "unknown"; my $parser_type = $doc->swish_parser_type || ''; my $binary = $doc->is_binary; my $msg = $doc->was_filtered ? '' : 'not'; my $name = $doc->name; msg(DEBUG, <filters_used ) { for my $filter ( @$filters_used ) { msg( DEBUG, " >Filter used: $filter->{name} ( $filter->{start_content_type} -> $filter->{end_content_type} )" ); } } if ( !$binary ) { my @doc = split /\n/, substr( ${$doc->fetch_doc}, 0, $max_chars ); $lines = @doc-1 if $lines >= @doc; msg(INFO, join "\n", '-- Output Content Sample --', @doc[0..$lines],'','-- end --','' ); } die "Skipping binary [$file]\n" if $binary && $skip_binary; if ($headers ) { my $len = length ${$doc->fetch_doc}; print "Path-Name: $file\nContent-Length: $len\n"; print "Document-Type: $parser_type\n" if $parser_type; print "\n"; } print ${$doc->fetch_doc} if $show_content; } sub process_doc_old { my ($file) = @_; my $uri; eval { $uri = URI->new( $file ) }; my %config = !$@ && $uri->scheme ? fetch_url( $file ) : fetch_file( $file); my $was_filtered = $filter->filter( %config, name => $file, ); my $content_type = $filter->content_type || "unknown"; my $orig_content_type = $filter->original_content_type || "unknown"; my $parser_type = $filter->swish_parser_type || ''; my $binary = $content_type !~ m[^text/]; my $msg = $was_filtered ? '' : 'not'; msg(DEBUG, <fetch_doc}, 0, $max_chars ); $lines = @doc-1 if $lines >= @doc; msg(INFO, join "\n", '-- Output Content Sample --', @doc[0..$lines],'','-- end --','' ); } die "Skipping binary [$file]\n" if $binary && $skip_binary; if ($headers ) { my $len = length ${$filter->fetch_doc}; print "Path-Name: $file\nContent-Length: $len\n"; print "Document-Type: $parser_type\n" if $parser_type; print "\n"; } print ${$filter->fetch_doc} if $show_content; } sub fetch_file { my $file = shift; # just try to open for error reporting open FH, "<$file" or die "Failed to open '$file': $!\n"; close FH; die "File '$file' has zero size\n" if -z $file; return ( document => $file ); } sub fetch_url { my $url = shift; eval { require LWP::UserAgent }; die "LWP::UserAgent is required to fetch a URL\n$@\n" if $@; my $ua = LWP::UserAgent->new; my $request = HTTP::Request->new('GET', $url ); my $response = $ua->request( $request ); die sprintf "Error while getting '%s'. Server returned %s.", $response->request->uri,$response->status_line unless $response->is_success; my $content = $response->content; my $content_type = $response->content_type; return ( document => \$content, content_type => $content_type, ); } sub msg { my $msg_level = shift; return if $quiet; return if !$verbose && $msg_level > DEBUG; printf( STDERR @_), print STDERR "\n"; } __END__ =head1 NAME swish-filter-test - program to test the SWISH::Filter module. =head1 SYNOPSIS swish-filter-test [options] <...> Options: -quiet don't generate messages to stderr -content output content to stdout -(no)skip_binary skip output of binary files (default) -lines Number of lines of content to display to stderr if verbose -headers output with headers for swish-e -S prog method -verbose enable $ENV{FILTER_DEBUG} for verbose output -path output @INC path to SWISH::Filter module -help brief help message -man full documentation =head1 DESCRIPTION swish-filter-test is a program to test the Perl module SWISH::Filter. SWISH::Filter is a module that is included with the swish-e distribution. Documents supplied to this script (as a URL or a plain file) on the command line are passed to the SWISH::Filter module. This is useful for testing filters. The SWISH::Filter module works by checking a document's content-type and looking for an available filter. Most filters require additional helper programs (e.g. the filter to convert PDF to HTML requires the Xpdf package). Using the -verbose options should indicate if a required program is missing. Options to this script control how much output is generated. Options can also be specified to generate output that can be piped directly to swish-e (see C<-headers> below). All messages are sent to stderr unless --content or -headers are specified and then content is sent to stdout. =head1 OPTIONS =over 8 =item B<-quiet> Don't write info out to stderr. Normally not useful unless you just want to filter a document and not really test the SWISH::Filter module. Fatal errors are written to stderr error regardless of the -quiet option. =item B<-verbose> Enables FILTER_DEBUG mode in the SWISH::Filter module, and enables extra info including a summary of the filtered document to stderr. Enable if trying to find out why a filter does not work. =item B<-lines> Number of summary lines of summary content to show. Summary lines are only showed if -verbose is selected. Lines are sent to stderr, not stdout. Note, the summary is limited to 1000 characters regardless of this option. =item B<-content> Specifying -content will output full content to stdout. The default is to only display a few lines. =item B<-(no)skip_binary> The default is to not output content from binary files. -noskip_binary will disable this. =item B<-headers> Prints the headers used by swish-e when reading documents with -S prog. This can be used to filter documents directly to swish-e: swish-filter-test -headers -content http://localhost/ test.pdf | swish-e -S prog -i stdin -v1 =item B<-path> Prints the installed location of the SWISH::Filter parent directory for use in PERL5LIB, Allows using SWISH::Filter in other programs, or with the Swish-e -S http method with swishspider. For example: PERL5LIB=`swish-filter-test -path` swish-e -S http -i http://localhost =item B<-help> Print a brief help message and exits. =item B<-man> Prints the manual page and exits. =back =cut