#!/usr/bin/perl use strict; use warnings; ## See documentation below. Script will require customization use File::Find; use Date::Parse; use HTML::TreeBuilder; use Data::Dumper; my $dumb_spamblock = '(at)not-real.'; # String that is removed from email address my $dir = shift || die "must specfy directory to search"; if ( $dir eq 'debug' ) { debug(@ARGV); } $dir .= '/' unless $dir =~ /\/$/; # make path relative below # Do all the work find( { wanted => \&wanted }, $dir ); sub wanted { return if -d; # don't need to process directories return unless /^\d+\.html$/; #output_file( $File::Find::name, parse_file($_) ); output_file( $File::Find::name, fast_parse($_) ); } sub output_file { my ( $file, $data ) = @_; #$file =~ s/$dir//; # make path relative to top level local $SIG{__WARN__} = sub { "$file: @_" }; # Get last_mod date my $date = str2time( $data->{comments}{received} ); unless ( $data ) { warn "Failed to parse received date in $file\n"; $date = str2time( $data->{comments}{send} ); unless ( $date ) { warn "Failed to parse any dates: skipping $file\n"; return; } } $data->{received} = $date; my $comments = $data->{comments}; $comments->{email} =~ s/\Q$dumb_spamblock/-blabla-/; my $metas = join "\n", map { qq[] } sort keys %{$data->{comments}}; my $title = $comments->{subject} || ''; my $html = < $title $metas $data->{body} EOF my $bytecount = length pack 'C0a*', $html; print <new; $tree->store_comments(1); # meta data is in the comments $tree->warn(1); $tree->parse_file( $file ); my %comments; # Extract out metadata for ( $tree->look_down( '_tag', '~comment' )) { my $comment = $_->attr("text"); $comments{$1} = $2 if $comment =~ /(\w+)="([^"]+)/; } $data{comments} = \%comments if %comments; # should die here if not. # Extract out the searchable content my $body = $tree->look_down('_tag', 'div', 'class', 'mail'); unless ( $body ) { warn "$file: failed to find
\n"; return; } # Remove some sub-nodes we don't care about $body->look_down('_tag', 'address', 'class', 'headers')->delete; $body->look_down('_tag', 'span', 'id', 'received')->delete; $data{body} = $body->as_HTML; $tree->delete; return \%data; } sub fast_parse { my $file = shift; local $_; unless ( open FH, "<$file" ) { warn "Failed to open '$file'. Error: $!"; return; } my %data; my %comments; # First parse out the comments while () { if ( my( $tag, $content) = /$/ ) { unless ( $content ) { warn "File '$file' tag '$tag' empty content\n"; next; } last if $tag eq 'body'; # no more comments in this section $comments{$tag} = $content; } } $data{comments} = \%comments; # Now grab the content my $end_str; # for skipping sections my $body = ''; while ( ) { # loo for ending tag, or maybe even the signature last if // || /^-- $/ || /^--$/ || /^(_|-){40,}\s*$/; # Look for ending tag for a skipped tag set if ( $end_str ) { $end_str = '' if /\Q$end_str/; next; } # These are sections to skip if ( /\Q
100000 UndefinedMetaTags ignore Index the documents: $ swish-e -c swish.conf -S prog Now create the search interface: $ cp /usr/local/lib/swish-e/swish.cgi . $ cat .swishcgi.conf $ENV{TZ} = 'UTC'; # display dates in UTC format return { title => "Search the Foo List Archive", display_props => [qw/ name email swishlastmodified /], sorts => [qw/swishrank swishtitle email swishlastmodified/], metanames => [qw/swishdefault swishtitle name email/], name_labels => { swishrank => 'Rank', swishtitle => 'Subject Only', name => "Poster's Name", email => "Poster's Email", swishlastmodified => 'Message Date', swishdefault => 'Subject & Body', }, highlight => { package => 'SWISH::PhraseHighlight', xhighlight_on => '', xhighlight_off => '', meta_to_prop_map => { # this maps search metatags to display properties swishdefault => [ qw/swishtitle swishdescription/ ], swishtitle => [ qw/swishtitle/ ], email => [ qw/email/ ], name => [ qw/name/ ], swishdocpath => [ qw/swishdocpath/ ], }, }, }; Setup web server (OS/web server dependent): /var/www # ln -s /path/to/hypermail/search /var/www # ln -s /path/to/hypermail/archive and maybe tell apache to run the script: $ cat .htaccess Deny from all Allow from all SetHandler cgi-script Options +ExecCGI =head1 DESCRIPTION This script is used to parse files produced by hypermail. Last tested with hypermail pre-2.1.9. It scans the directory passed as the first parameter for files matching \d+\.html and then extracts out the content, email, name and subject. This is then passed to swish-e for indexing. The swish.cgi script is used for searching the resulting index. Configuration settings are stored in the .swish.cgi file located in the current directory. By default, swish.cgi expects the current working directory to be the location of the cgi script. On other web servers this may not be the case and you will need to edit swish.cgi to use absolute path names for .swishcgi.conf and the index files. =head1 USAGE See the SYNOPSIS above. =head1 COPYRIGHT This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO Hypermail can be downloaded from: http://hypermail.org =head1 AUTHOR Bill Moseley moseley@hank.org. 2004 =head1 SUPPORT Please contact the Swish-e discussion email list for support with this module or with Swish-e. Please do not contact the developers directly.