# Configuration for swish-e site.
#
# A few custom callbacks are located after the @servers definition section;
# these are used to split files into sections.
#
# Environment:
#   SWISH_SITE  - required; base URL of the site to spider
#                 (e.g. SWISH_SITE=http://swish-e.org)
#   START_FILE  - optional; file under SWISH_SITE where spidering starts
#                 (defaults to index.html)

use URI;
use warnings;
use strict;

use vars '@servers';

my $base_path = $ENV{SWISH_SITE} || die "must set \$ENV{SWISH_SITE} (e.g SWISH_SITE=http://swish-e.org)";

$base_path =~ s[/$][];    # no trailing slash

# Path component of the site root; test_url refuses anything outside it.
my $top_path = URI->new("$base_path/")->path;    # don't go above this.

my $start_file = $ENV{START_FILE} || 'index.html';

@servers = (
    {
        base_url   => "$base_path/$start_file",
        keep_alive => 1,                        # enable keep alives requests
        email      => 'swish@domain.invalid',
        delay_sec  => 0,                        # why wait?
        use_md5    => 1,

        # Decide whether a URL should be fetched at all.
        # $_[0] is the URI object for the link; returns 1 to fetch, 0 to skip.
        test_url => sub {
            my $path = $_[0]->path;

            # Skip images and compressed archives by extension.  The
            # alternatives must not carry their own dot: the previous
            # '.png' / '.gz' forms demanded an extra character after the
            # literal dot and so never matched plain foo.png / foo.gz.
            # 'tgz' is listed because the old pattern matched it.
            return 0 if $path =~ /\.(?:gif|jpe?g|png|gz|tgz)$/i;

            return 0 if $path =~ m!/archive/!;           # don't index the list archive again
            return 0 if $path =~ m!/search_archive/!;    # and the old search script

            # Don't follow any links above the base_url
            return 0 unless $path =~ /^\Q$top_path/;

            return 1;
        },

        # Only index text/html -- do we have any text/plain?
        # $_[2] is the response object; its content type decides.
        test_response => sub { return $_[2]->content_type =~ m[text/html] },

        # split content - comment out to disable splitting
        filter_content => \&split_page,
    },
);

#===============================================================================
# split_page -
#
# This is based on HTML::Parser.
# More accurate than the regex method, but slower
#
#-------------------------------------------------------------------------------
# split_page( $uri, $server, $response, $content_ref )
#
# Installed as the filter_content callback.  Builds a Swish::Split object from
# the four arguments, lets it split and output the page, then frees the parse
# tree.
#
# Returns: the logical negation of Swish::Split::process -- false when the
#          page was split and output here, true when the page still has to be
#          indexed as a whole.  Returns nothing if no Swish::Split object
#          could be created.
#-------------------------------------------------------------------------------
sub split_page {
    my %params;
    @params{ qw/ uri server response content / } = @_;
    $params{found} = 0;

    my $doc = Swish::Split->new( \%params );
    return unless $doc;

    my $ret = !$doc->process;

    $doc->tree->delete;    # release the parsed tree

    return $ret;
}

#---------------------------------------------------------------------------------
package Swish::Split;
use warnings;
use strict;

use HTML::TreeBuilder;
use HTML::Element;

#=================================================================================
# new( \%params ) -- parses the page and blesses the parameter hash
#
# %params must hold 'content' (a reference to the HTML text); split_page also
# supplies 'uri', 'server', 'response' and 'found'.  This constructor adds
# 'tree', 'head' and 'page_length', and every key becomes a read accessor.
#
#---------------------------------------------------------------------------------
sub new {
    my ( $class, $params ) = @_;

    # Parse the HTML into a tree
    my $tree = HTML::TreeBuilder->new;
    $tree->store_comments(1);    # let swish decide about indexing comments
    $tree->parse( ${ $params->{content} } );
    $tree->eof;

    $params->{tree} = $tree;

    # Find the head section
    $params->{head} = $tree->look_down( '_tag', 'head' );

    # Length of the complete page, as opposed to a single section
    $params->{page_length} = length ${ $params->{content} };

    my $self = bless $params, $class;
    $self->accessorize;

    return $self;
}

#=================================================================================
# accessorize() -- creates a read-only accessor method for every hash key of
# the object, unless a method of that name already exists.
#
#---------------------------------------------------------------------------------
sub accessorize {
    my ( $self ) = @_;

    no strict 'refs';    # needed to install subs through a symbolic glob name

    for my $key ( keys %$self ) {
        next if $self->can( $key );

        # Fully qualified so it is obvious where the method lands.
        *{ __PACKAGE__ . "::$key" } = sub { shift->{$key} };
    }

    return;
}

#==================================================================================
# Process the document tree
#
# Returns: true if tree was processed.  False means still need to index file.
#
#---------------------------------------------------------------------------------
sub process {
    my ( $self ) = @_;

    my $uri = $self->uri;

    warn "\nProcessing $uri\n" if $ENV{VERBOSE};

    my $content_section = $self->tree->look_down( qw[ _tag div id main-copy ] );

    unless ( $content_section ) {
        warn qq[Failed to find <div id="main-copy"> in $uri. Indexing full content\n];
        return;    # Return false indicating spider to index the page as normal
    }

    # Now look for content divided into sections
    my @sub_sections = $content_section->look_down( qw[ _tag div class sub-section ] );

    unless ( @sub_sections ) {
        warn qq[Failed to find <div class="sub-section"> in $uri. Indexing full content\n]
            if $ENV{VERBOSE};

        # No sections: output the main content as one page under the
        # original head and uri.
        $self->create_page( $content_section, $self->head, $self->uri );
        return 1;
    }

    # One output page per section, each with its own cloned head and uri.
    for my $section ( @sub_sections ) {
        my ( $new_head, $new_uri ) = $self->new_head( $section );
        $self->create_page( $section, $new_head, $new_uri );
        $new_head->delete;
    }

    return 1;    # says we were successful -- so spider should not index the page
}

#================================================================================
# new_head() -- clones the head section and returns an array of a new head and uri
#
# The first heading found in $section supplies the text added to the title,
# and the name= of an anchor inside that heading supplies the uri fragment.
#
#--------------------------------------------------------------------------------
sub new_head {
    my ( $self, $section ) = @_;

    my $head     = $self->head->clone;
    my $uri      = $self->uri->clone;
    my $fragment = '';

    # Look for the first <hN> tag, presumably of the form
    #   <h1><a name="...">So, is Swish-e a search engine?</a></h1>
    if ( my $h_tag = $section->look_down( '_tag', qr/^h\d$/ ) ) {

        my $description = $h_tag->as_text || 'missing description';    # for title

        # grab the name= text for the fragment
        my $anchor = $h_tag->look_down( '_tag', 'a', sub { defined( $_[0]->attr('name') ) } );

        if ( $anchor ) {
            $fragment = $anchor->attr('name');
            $fragment =~ s/\n/ /g;
            $uri->fragment( $fragment );
        }
        else {
            warn "Failed to find <a name=...> target for a section in $uri\n";
        }

        # Modify or create the title
        my $title = $head->look_down( '_tag', 'title' );

        if ( $title ) {
            $title->push_content( ": $description" );
        }
        else {
            # Create a new title
            my $new_title = HTML::Element->new('title');
            $new_title->push_content( $description );
            $head->push_content( $new_title );
        }
    }
    else {
        warn "Failed to find <hN> heading in one of the sections of $uri\n";
    }

    warn " -> #$fragment\n" if $ENV{VERBOSE};

    return ( $head, $uri );
}

#=================================================================================
# create_page() -- creates a new HTML page and indexes it.
#
# $section and $head are moved into a freshly built <html> document, which is
# serialized, passed to main::output_content() and then deleted.  Both
# elements are therefore destroyed along with that document.
#
#---------------------------------------------------------------------------------
sub create_page {
    my ( $self, $section, $head, $uri ) = @_;

    # Add a <meta> tag to allow limiting based on the type of doc:
    # 'devel' for .../devel_docs/, 'docs' for .../docs/, else 'website'.
    my $section_type = $uri =~ m!(?:(devel)_)?(docs)/! ? ( $1 || $2 ) : 'website';

    $head->push_content(
        HTML::Element->new(
            'meta',
            name    => 'section',
            content => $section_type,
        )
    );

    # Add the total document length, which is different than the section length
    $head->push_content(
        HTML::Element->new(
            'meta',
            name    => 'pagelen',
            content => $self->page_length,
        )
    );

    my $body = HTML::Element->new('body');
    my $doc  = HTML::Element->new('html');

    $body->push_content( $section );
    $doc->push_content( $head, $body );

    my $new_content = $doc->as_HTML( undef, "\t" );

    # Fix up title - probably should get this from template.
    # The opening tag is part of the match so the site prefix is removed
    # without inserting a second <title> tag.
    $new_content =~ s/<title>Swish-e ::\s+/<title>/;

    # This calls code in the spider function.
    main::output_content( $self->server, \$new_content, $uri, $self->response );

    $doc->delete;

    return;
}

1;