# Configuration for swish-e site.
#
# A few custom callbacks are located after the @servers definition section.
# They are used to split files into sections.
use URI;
use warnings;
use strict;
use vars '@servers';
# Root URL of the site to spider. Required; read from the environment.
my $base_path = $ENV{SWISH_SITE} || die "must set \$ENV{SWISH_SITE} (e.g SWISH_SITE=http://swish-e.org)";
$base_path =~ s[/$][]; # no trailing slash

# Path component of the site root -- don't go above this.
my $top_path = URI->new("$base_path/")->path;

# First document to fetch, relative to the site root.
my $start_file = $ENV{START_FILE} || 'index.html';

@servers = (
    {
        base_url   => "$base_path/$start_file",
        keep_alive => 1,                         # enable keep-alive requests
        email      => 'swish@domain.invalid',
        delay_sec  => 0,                         # why wait?
        use_md5    => 1,

        # Decide from the URL alone whether to follow it.
        # $_[0] is used as a URI object. Returns 0 to reject, 1 to accept.
        test_url => sub {
            my $path = $_[0]->path;

            # Skip images and gzipped files. The extensions are listed
            # without a leading dot: the "\." in front of the group already
            # matches it (a bare "." inside the group would demand an extra
            # character, so "foo.png" would slip through).
            return 0 if $path =~ /\.(?:gif|jpe?g|png|gz)$/i;

            return 0 if $path =~ m!/archive/!;          # don't index the list archive again
            return 0 if $path =~ m!/search_archive/!;   # and the old search script

            # Don't follow any links above the base_url
            return 0 unless $path =~ /^\Q$top_path\E/;

            return 1;
        },

        # Only index text/html -- do we have any text/plain?
        # $_[2] is used as the response object.
        test_response => sub { return $_[2]->content_type =~ m[text/html] },

        # split content - comment out to disable splitting
        filter_content => \&split_page,
    },
);
#===============================================================================
# split_page -- filter_content callback: index a page section by section
#
# This is based on HTML::Parser (through HTML::TreeBuilder in Swish::Split).
# More accurate than the regex method, but slower.
#
# Arguments, in order: uri, server, response, content.
#   content is handed to Swish::Split->new, which dereferences it as a
#   reference to the page text.
#
# Returns: the logical negation of Swish::Split::process -- false when the
#   page was processed there, true when it was not. Returns nothing if no
#   Swish::Split object could be created.
#-------------------------------------------------------------------------------
sub split_page {
    my ( $uri, $server, $response, $content ) = @_;

    my %params = (
        uri      => $uri,
        server   => $server,
        response => $response,
        content  => $content,
        found    => 0,
    );

    my $splitter = Swish::Split->new( \%params );
    return unless $splitter;

    my $handled = $splitter->process;

    # Release the parse tree explicitly before handing control back.
    $splitter->tree->delete;

    return !$handled;
}
#---------------------------------------------------------------------------------
package Swish::Split;
use warnings;
use strict;
use HTML::TreeBuilder;
use HTML::Element;
# new( $params ) -- parse the page and return a Swish::Split object
#
# $params is a hash reference whose "content" entry is a reference to the
# HTML text. The same hash is blessed into $class and returned, with three
# entries added:
#   tree        - the HTML::TreeBuilder tree built from the text
#   head        - the element look_down finds for the "head" tag
#   page_length - length of the HTML text
# An accessor method is then generated for every key (see accessorize).
sub new {
    my ( $class, $params ) = @_;

    my $html_ref = $params->{content};

    # Parse the HTML into a tree
    my $tree = HTML::TreeBuilder->new;
    $tree->store_comments(1);    # let swish decide about indexing comments
    $tree->parse( $$html_ref );
    $tree->eof;

    $params->{tree} = $tree;

    # Find the head section
    $params->{head} = $tree->look_down( _tag => 'head' );

    $params->{page_length} = length $$html_ref;

    bless $params, $class;
    $params->accessorize;

    return $params;
}
# accessorize() -- generate a read-only accessor for each key of the object
#
# For every key in the object's hash that the object cannot already handle
# as a method, a sub of that name is installed (through a symbolic glob
# reference) that returns the invocant's value for the key.
sub accessorize {
    my ( $self ) = @_;

    # Only keys with no existing method of the same name get an accessor.
    my @missing = grep { !$self->can( $_ ) } keys %$self;

    for my $field ( @missing ) {
        no strict 'refs';    # installing a sub by name needs a symbolic ref
        *{$field} = sub {
            my ( $object ) = @_;
            return $object->{$field};
        };
    }
}
#==================================================================================
# process() -- Process the document tree
#
# Looks for <div id="main-copy"> in the tree. If it is missing, warns and
# returns false. Otherwise each <div class="sub-section"> inside it is passed
# to create_page with its own cloned head and URI (see new_head); if there are
# no sub-sections, the whole main-copy div is passed to create_page once with
# the original head and URI.
#
# Returns: true if tree was processed. False means still need to index file.
#
#---------------------------------------------------------------------------------
sub process {
    my ( $self ) = @_;

    my $uri = $self->uri;
    warn "\nProcessing $uri\n" if $ENV{VERBOSE};

    my $content_section = $self->tree->look_down( qw[ _tag div id main-copy ] );
    unless ( $content_section ) {
        warn qq[Failed to find <div id="main-copy"> in $uri. Indexing full content\n];
        return;    # Return false indicating spider to index the page as normal
    }

    # Now look for content divided into sections
    my @sub_sections = $content_section->look_down( qw[ _tag div class sub-section ] );

    unless ( @sub_sections ) {
        warn qq[Failed to find <div class="sub-section"> in $uri. Indexing full content\n]
            if $ENV{VERBOSE};
        $self->create_page( $content_section, $self->head, $self->uri );
    } else {
        for my $sub_section ( @sub_sections ) {
            my ( $new_head, $new_uri ) = $self->new_head( $sub_section );
            $self->create_page( $sub_section, $new_head, $new_uri );
            $new_head->delete;
        }
    }

    return 1;    # says we were successful -- so spider should not index the page
}
#================================================================================
# new_head() -- clones the head section and returns an array of a new head and uri
#
# $section is the element for one section of the page. The first heading tag
# (tag name matching /^h\d$/) found in it supplies:
#   - the text appended to the cloned <title> (or used for a new <title> when
#     the head has none), and
#   - the URI fragment, taken from the name attribute of an <a name="...">
#     inside that heading.
# A warning is issued when the heading or the named anchor is missing; the
# cloned head and URI are returned either way.
#
# Returns: ( $head, $uri ) -- clones of the object's head and uri.
#
#--------------------------------------------------------------------------------
sub new_head {
    my ( $self, $section ) = @_;

    my $head     = $self->head->clone;
    my $uri      = $self->uri->clone;
    my $fragment = '';

    # Look for the first heading tag (<h1>, <h2>, ...) in the section, e.g.
    #   <h2><a name="..."></a>So, is Swish-e a search engine?</h2>
    if ( my $h_tag = $section->look_down( '_tag', qr/^h\d$/ ) ) {
        my $description = $h_tag->as_text || 'missing description';    # for title

        # grab the name= text for the fragment
        if ( my $name = $h_tag->look_down( '_tag', 'a', sub { defined($_[0]->attr('name')) } ) ) {
            $fragment = $name->attr('name');
            $fragment =~ s/\n/ /g;
            $uri->fragment( $fragment );
        } else {
            warn "Failed to find target for a section in $uri\n";
        }

        # Modify or create the title
        if ( my $title = $head->look_down( '_tag', 'title' ) ) {
            $title->push_content( ": $description" );
        } else {    # Create a new title
            my $new_title = HTML::Element->new('title');
            $new_title->push_content( $description );
            $head->push_content( $new_title );
        }
    } else {
        warn "Failed to find a heading tag in one of the sections of $uri\n";
    }

    warn " -> #$fragment\n" if $ENV{VERBOSE};

    return ( $head, $uri );
}
#=================================================================================
# create_page() -- creates a new HTML page and indexes it.
#
# Arguments: $section (element that becomes the page body), $head (head
# element for the page) and $uri (the URI the page is output under).
#
# Two <meta> tags are appended to $head, then $head and $section are moved
# into a fresh <html> document, which is serialised and passed to
# main::output_content. The document is deleted afterwards.
#
#---------------------------------------------------------------------------------
sub create_page {
    my ( $self, $section, $head, $uri ) = @_;

    # Work out the type of doc from the URI: "devel" or "docs" when the URI
    # contains "devel_docs/" or "docs/", otherwise "website".
    my $section_type = 'website';
    if ( $uri =~ m!(?:(devel)_)?(docs)/! ) {
        $section_type = $1 || $2;
    }

    # Tag to allow limiting based on the type of doc
    my $type_meta = HTML::Element->new(
        'meta',
        name    => 'section',
        content => $section_type,
    );

    # The total document length, which is different than the section length
    my $length_meta = HTML::Element->new(
        'meta',
        name    => 'pagelen',
        content => $self->page_length,
    );

    $head->push_content( $type_meta, $length_meta );

    # Assemble <html><head>...</head><body>section</body></html>
    my $body = HTML::Element->new('body');
    $body->push_content( $section );

    my $doc = HTML::Element->new('html');
    $doc->push_content( $head, $body );

    my $new_content = $doc->as_HTML( undef, "\t" );

    # Fix up title - probably should get this from template
    $new_content =~ s/Swish-e ::\s+//;

    # This calls code in the spider function.
    main::output_content( $self->server, \$new_content, $uri, $self->response );

    $doc->delete;
}
1;