#!/usr/bin/perl -w
use strict;
## See the documentation below; this script may require customization.
## Read the documentation with "perldoc index_hypermail.pl".
use File::Find;
use Date::Parse;
use HTML::TreeBuilder;
use Data::Dumper;
## Spam-block string, as defined in the hypermailrc file. While indexing it
## is matched literally inside each message's email address and substituted
## out; change it to match your own hypermail configuration.
#
#---------------------- config -----------------------------------------------------
my $dumb_spamblock = '(at)not-real.';
#------------------------------------------------------------------------------------

# The first command-line argument names the directory to search (or is the
# word 'debug'). Test for a defined, non-empty value rather than plain truth
# so that a directory literally named "0" is not rejected.
my $dir = shift;
die "must specify directory to search"
    unless defined $dir && length $dir;

# When the first argument is 'debug', pass the remaining arguments to debug().
debug(@ARGV) if $dir eq 'debug';

# Do all the work: File::Find calls wanted() for every entry below $dir.
find( { wanted => \&wanted }, $dir );
# File::Find callback, invoked once per directory entry with the entry's
# base name in $_ and its full path in $File::Find::name.
#
# Only plain entries whose name is all digits followed by ".html" are
# processed: each one is parsed and the result handed to output_file().
# Everything else is skipped silently.
sub wanted {

    # Directories themselves carry no message content.
    if ( -d $_ ) {
        return;
    }

    # Only files named <digits>.html are of interest.
    unless ( $_ =~ /^\d+\.html$/ ) {
        return;
    }

    # To parse with HTML::Parser instead, swap fast_parse() for parse_file()
    # in the call below -- but that is a LOT slower:
    #   output_file( $File::Find::name, parse_file($_) );
    return output_file( $File::Find::name, fast_parse($_) );
}
sub output_file {
my ( $file, $data ) = @_;
local $SIG{__WARN__} = sub { "$file: @_" };
# Get last_mod date
my $date = str2time( $data->{comments}{received} );
unless ( $data ) {
warn "Failed to parse received date in $file\n";
$date = str2time( $data->{comments}{send} );
unless ( $date ) {
warn "Failed to parse any dates: skipping $file\n";
return;
}
}
$data->{received} = $date;
my $comments = $data->{comments};
$comments->{email} =~ s/\Q$dumb_spamblock/-blabla-/;
my $metas = join "\n", map { qq[] }
sort keys %{$data->{comments}};
my $title = $comments->{subject} || '';
my $html = <
$title
$metas
$data->{body}