1#@!#!123s

: / usr / lib / swish-e /
Filename : DirTree.pl
back
#!/usr/bin/perl -w
use strict;

## Run this program with the -man option for documentation ##


# This is set to where Swish-e's "make install" installed the helper modules.
use lib ( '/usr/lib/swish-e/perl' );


use File::Find;    # for recursing a directory tree
use Getopt::Long;
use Pod::Usage;


#--------------- User Configuration Section ------------------------
# File extensions that indicate text files even though SWISH::Filter thinks
# they might be binary based on mime.types

# Each entry is compared literally against the end of the file's path.
# Note that 'rc' has no leading dot, so it matches any name ending in "rc".
my @not_binary_extensions = (
    '.pl',
    '.pm',
    '.c',
    '.conf',
    'rc',
);


# Subroutines to validate files and directories.
# Return true if file is ok to process or false to skip the file/directory
# The Default for these functions is to return true and process all
# files and directories.


# User hook deciding whether a single file is indexed.
#   $path - path of the file about to be processed
# Return true to process the file, false to skip it.  This default
# implementation accepts every file.
sub check_path {
    my ($path) = @_;
    return 1;
}

# User hook deciding whether a directory is descended into.
#   $dir - path of the directory about to be processed
# Return true to process the directory; a false value makes wanted()
# prune it.  This default implementation accepts every directory.
sub check_dir {
    my ($dir) = @_;
    return 1;
}

#-------------------- End User Config ------------------------------------





# Build the pattern that recognises "text despite its MIME type" files:
# any of the configured suffixes, matched literally (quotemeta) at the end
# of the path.
#
# An empty suffix list would produce an empty alternation, which matches
# every path and would disable the binary check completely; in that case
# use a pattern that can never match instead.
my $extensions = join '|', map { quotemeta } @not_binary_extensions;
my $textre = @not_binary_extensions
    ? qr/(?:$extensions)$/
    : qr/(?!)/;


# Parse the command line into %options.  Specs ending in "!" are negatable
# switches; a parse failure prints the usage message and exits.
my %options;
my @option_spec = qw( verbose! debug! symlinks! path man no_skip );
GetOptions( \%options, @option_spec )
    or pod2usage(2);

# -man: show the complete documentation.
if ( $options{man} ) {
    pod2usage( -verbose => 2 );
}

# -path: report the library directory configured at installation, then stop.
if ( $options{path} ) {
    print '/usr/lib/swish-e/perl', "\n";
    exit;
}


# Whatever remains in @ARGV is the list of directories to walk.
@ARGV or pod2usage("Must supply at least one directory");


# -debug also sets the FILTER_DEBUG environment variable.
if ( $options{debug} ) {
    $ENV{FILTER_DEBUG} = 1;
}



# See perldoc File::Find for information on following symbolic links
# and other important topics.

use constant DEBUG => 0;

# Try to load the filter module.  SWISH::Filter is optional: when it cannot
# be loaded $filter stays undefined and wanted() falls back to reading the
# files directly.
#
# $filter is declared unconditionally because "my ... unless COND" has
# undefined behaviour in Perl (see perlsyn).  Success of the require is
# taken from the eval's own return value, so a stale or clobbered $@
# cannot influence the decision.
my $filter;
if ( eval { require SWISH::Filter; 1 } ) {
    $filter = SWISH::Filter->new;
}
elsif ( $options{debug} ) {
    warn "Failed to load SWISH::Filter [$@]\n";
}



# -debug implies -verbose.  This must be settled before the directory walk
# starts, because wanted() consults $options{verbose} for every entry.
$options{verbose} = 1 if $options{debug};

# Walk every directory named on the command line, passing each entry found
# to wanted().
find(
    {
        wanted      => \&wanted,
        no_chdir    => 1,  # 5.6 feature
        # GetOptions stores the -symlinks switch under the key "symlinks".
        follow      => $options{symlinks},
    },
    @ARGV,
);

# File::Find callback, called for every entry found below the directories
# given on the command line.
#
# Directories are only checked with check_dir() (and pruned when it returns
# false); they are never output themselves.  Readable files accepted by
# check_path() are converted with SWISH::Filter when it was loaded,
# otherwise they are handed to process_file() unmodified.
sub wanted {
    my $path = $File::Find::name;

    # The -d test stats $_ and fills the "_" stat buffer reused below.
    if ( -d ) {
        return if check_dir( $path );

        $File::Find::prune = 1;
        $options{verbose}
            and warn "Skipped dir [$path] by user function check_dir()\n";
        return;
    }

    unless ( -r _ ) {
        $options{verbose}
            and warn "$File::Find::name is not readable\n";
        return;
    }

    # Take the mtime now, before any user hook or filter runs.
    my $mtime = ( stat _ )[9];

    unless ( check_path( $path ) ) {
        $options{verbose}
            and warn "Skipped path [$path] by user function check_path()\n";
        return;
    }

    # Without SWISH::Filter the file is fetched manually.
    return process_file( $path, $mtime ) unless $filter;

    my $doc = $filter->convert( document => $path );

    if ( !$doc ) {
        # With -no_skip a failed conversion falls back to the raw file.
        if ( $options{no_skip} ) {
            process_file( $path, $mtime );
        }
        elsif ( $options{verbose} ) {
            warn "Failed [$path] SWISH::Filter->convert failed.\n";
        }
        return;
    }

    # Ignore "binary" files (not text/* mime type) unless the path ends in
    # one of the configured text suffixes.
    my $treat_as_text = !$doc->is_binary || $path =~ /$textre/;
    unless ( $treat_as_text ) {
        $options{verbose}
            and warn "Skipping [$path] due to content type: " . $doc->content_type .": may be binary\n";
        return;
    }

    my $bytes = output_document(
        $path,
        $doc->fetch_doc,
        $mtime,
        $doc->swish_parser_type,
    );

    return unless $options{verbose};

    my $filter_note = $doc->was_filtered ? "(Was filtered) " : "(Not filtered) ";
    my $mime_type   = $doc->content_type . " ";
    my $parser_note = $doc->swish_parser_type || '(parser unspecified)';

    print STDERR
        "Indexed [$path]  ",
        $filter_note,
        $mime_type,
        $parser_note,
        "  ($bytes bytes)",
        "\n";
    return;
}

# Read a file from disk without any filtering and output it for indexing.
#
#   $path  - file to read
#   $mtime - modification time to report in the document header
#
# If the file cannot be opened a warning is issued and the file is skipped.
# No meaningful value is returned.
sub process_file {
    my ( $path, $mtime ) = @_;

    # Three-argument open with a lexical handle: the path is treated purely
    # as a file name, so names containing characters such as '>', '<' or '|'
    # or leading/trailing whitespace are opened as-is instead of being
    # interpreted as an open mode or a command.
    my $fh;
    unless ( open $fh, '<', $path ) {
        warn "Failed to open '$path': $!\n";
        return;
    }

    # Slurp the whole file; the complete content is needed to report its
    # length in the header.
    my $content = do { local $/; <$fh> };
    close $fh;

    my $bytes = output_document( $path, \$content, $mtime );

    if ( $options{verbose} ) {
        print STDERR "Indexed [$path] (not processed with SWISH:Filter)  ($bytes bytes)\n";
    }

    return;
}


# Print one document to STDOUT: a block of header lines, a blank line, then
# the content.
#
#   $path        - value for the Path-Name header
#   $content_ref - reference to the scalar holding the document content
#   $mtime       - value for the Last-Mtime header
#   $parser_type - optional; when true a Document-Type header is added
#
# Returns the length that was written to the Content-Length header, which
# the callers show in their verbose "(N bytes)" messages.
sub output_document {
    my ( $path, $content_ref, $mtime, $parser_type ) = @_;

    # Get the length of the content - have to worry about multi-byte content
    # ugly and maybe expensive, but perhaps more portable than "use bytes"
    my $bytecount = length pack 'C0a*', $$content_ref;

    my $header = "Path-Name: $path\nContent-Length: $bytecount\nLast-Mtime: $mtime\n";
    $header .= "Document-Type: $parser_type\n" if $parser_type;

    print $header . "\n" . $$content_ref;

    # Return the count explicitly; without this the sub would return the
    # result of print (1) and callers would always report "1 bytes".
    return $bytecount;
}

__END__

=head1 NAME

DirTree.pl - program to fetch local documents for Swish-e

=head1 SYNOPSIS

DirTree.pl [options] directory <directory...> | swish-e -S prog -i stdin

  Options:
    -verbose        Display processing info
    -debug          Enable debugging (including SWISH::Filter debugging)
    -man            Display documentation
    -path           Display location lib path set at installation
    -no_skip        Process documents even if filtering fails
    -symlinks       Follow symbolic links.  Default is to NOT follow symlinks

=head1 DESCRIPTION

DirTree.pl is an example Perl script that can be used with Swish-e to
fetch documents from the local file system.  It works somewhat like
Swish-e's default -S fs input method (reading from the file system).
DirTree.pl will attempt to load the SWISH::Filter module for use in filtering
documents (e.g. PDF or MS Word).

DirTree.pl is a thin wrapper around Perl's File::Find module.  Before modifying
this script for your own use please read the documentation for File::Find:

    $ perldoc File::Find

IMPORTANT: By default DirTree.pl will attempt to index all files in the
directories and sub-directories supplied.  It's expected that you will
customize this script for your own needs.

When using -S prog many of the features available to select or exclude
files that can be specified in the swish-e config file will have no effect.
It's expected that checks on files will be added to the DirTree.pl program.
This is much more powerful and allows more control, but requires more work
to setup.

There are two skeleton functions at the top of DirTree.pl that can be modified
for filtering what gets indexed: check_path() and check_dir().  Both are passed
in the path or directory name as their only parameter.  Return FALSE to skip
the given path or directory.

Here are two examples:

    # Skip all .wav files.
    sub check_path {
        my $path = shift;
        return if $path =~ /\.wav$/;  # return false if ends in .wav?
        return 1;  # otherwise return true
    }

    # Skip all directories that start with a dot (hidden dirs)
    sub check_dir {
        my $dir = shift;
        return $dir !~  m[^\.];  # return true if does not start with a dot
    }

Those are called for each file or directory processed.  The File::Find module also
provides a preprocess option where all the files and directories in a directory are
passed in as a list to a subroutine.  This list can be filtered and passed back to
File::Find.  This would be useful if, say, you wanted to skip a directory if a file
"noindex" existed in the directory.  See perldoc File::Find for details.

=head2 Filtering

Filtering is the process of converting a document that swish-e cannot index into
a document that swish-e can index.

The SWISH::Filter module is used for filtering documents.  SWISH::Filter is
part of the swish-e distribution and was installed at the same time Swish-e was
installed.  SWISH::Filter uses "helper" programs to do the actual filtering.
For example, to filter PDF files you would need to have the Xpdf package
installed (included with the Windows version of Swish-e).  When SWISH::Filter
is first loaded it determines which filters are available.

SWISH::Filter uses the MIME::Types module to convert a file name into a MIME
type (e.g. .doc => application/msword) and that type is used to determine what
filter to use, if any.  Filters convert the document to a new MIME type (e.g. the MS Word
filter might convert the document to text/html or text/plain).

=head2 Binary Files

After Filtering, this program (DirTree.pl) then checks to see if the file is a binary
file.  This is a very simple test that simply looks for "text/" at the start
of the MIME type.  Clearly, this is incorrect for many MIME types.  For example,
if you were indexing Perl scripts of type "application/x-perl" this program
would think the file was binary and not index it.

At the top of the program is a list of file endings that tell DirTree.pl that
they should be indexed even if their MIME type does not start with "text/".

Another problem is some files will not map to a MIME type.  The best solution
is to add the file ending and MIME type to your mime.types file.  But, if you just
want to index any file that does not have a MIME type use the -no_skip option.


=head1 REQUIREMENTS

To use the SWISH::Filter module you will need the helper applications installed.
Check with your OS packages or Google for sources.

    PDF conversion requires the Xpdf package
    MS Word conversion requires the Catdoc package

The Windows version of Swish-e includes Xpdf and Catdoc packages.

For content type matching install the Perl MIME::Types module.

=head1 OPTIONS

A few options may be passed to DirTree.pl

=over 8

=item B<-verbose>

Produces information about each file as it is processed.

=item B<-debug>

Enables detailed debugging.  SWISH::Filter debugging is also enabled.

=item B<-no_skip>

When set documents that fail processing with SWISH::Filter will still
be processed.  Typically this means documents where a content-type could not be determined.
Make sure you have the MIME::Types module installed.

=item B<-symlinks>

When specified will recurse into directories that are symbolic links.
The default is to NOT recurse into symbolic links.  This option sets the "follow"
option in the File::Find module.

=back

=head1 BUGS

May not work well on multi-byte input files.

In order to work on Windows (where two chars are used to terminate lines)
this program reads the ENTIRE file into memory so that an accurate byte count 
can be made.  Therefore, it's probably a good idea not to index files that are too big.


=head1 SUPPORT

Contact the Swish-e discussion list.  See:

    http://swish-e.org