package HTML::ParseTables;

=head1 NAME

HTML::ParseTables - Extract text from HTML tables. Version 0.05 (pre-alpha)

=cut

use strict;
use HTML::Parser;
use vars qw( $VERSION @ISA @EXPORT @EXPORT_OK
             %config
             %TABLES        
             $table_count
             $depth      
             $row          
             $cell         
             $index      
             $incell     
             $debug
           );
require Exporter;

@ISA = qw(HTML::Parser Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(%config);

$VERSION = '0.05';
$debug = 0;
local $[ = 1; ## !! BEWARE: Array index of first element changed to 1 instead of 0!! ##

%config = (
            'trim'    => 1,
            'format' => 'tab delimited',
          );

my %tags = (
    'table_start'    =>    sub {&table_start},
    'table_end'        =>    sub {&table_end},
    'td_start'        =>    sub {&td_start},
    'td_end'        =>    sub {&td_end},
    'tr_start'        =>    sub {&tr_start},
    'tr_end'        =>    sub {&tr_end},
    'th_start'        =>    sub {&th_start},
    'th_end'        =>    sub {&th_end},
);


sub new {
    my($class, $html) = @_;
    my $self = $class->SUPER::new;
    
    %TABLES      = ();                        
    $table_count = 0;                    
    $depth       = 0; # for nested tables
    $row         = 0;                       
    $cell        = 0;                       
    $index       = 0;                    
    $incell      = 0;                         

    $self->parse($html) if $html;
    $self;
}   
    
sub start {                   # called by parse
    my $this = shift;
    my ($tag, $attr) = @_;
    
    &{ $tags{$tag . "_start"} }($attr) if ( exists $tags{$tag . "_start"} );
}

sub get_table {
    my ($self, $table) = @_;
    $table = $index unless $table; # default table is last
    print STDERR "get_table will return table $table (", $TABLES{$table}, ")\n" if $debug > 4;
    return @{ $TABLES{$table} };
}
sub get_table_as_text {
    my ($self, $table) = @_;
    $table = $index unless $table; # default table is last
    
    my $separator = " "; # default

    if ($config{'format'} =~ /tab/io) {
        # wants tab delimited
        $separator = "\t";
    }
    elsif ( $config{'format'} =~ /csv|comma/io) {
        # wants comma separated delimited
        $separator = ", ";
    }

    my ($row, @rows, $text);
    foreach $row ( @{$TABLES{$table}} ) {
        $text .= join($separator, @{$row}) . "\n";
    }
    chomp $text;
    return $text;
}
sub get_row {
    my ($self, $table, $row) = @_;
    if ($row) {
        return @{ $TABLES{$table}->[$row] };
    }
    else {
        # $table was skipped. use default (last) table
        # row ended up in $table
        return @{ $TABLES{$index}->[$table] };
    }
}
sub get_cell {
    my $self = shift;
    my ($table, $column, $row);
    
    # Accept:
    #     get_cell($table, $column, $row)
    #     get_cell($table, 'B3')
    #    get_cell($column, $row)
    #    get_cell('A5')

    if (scalar(@_) == 3) {
        ($table, $column, $row) = @_ ;
        print STDERR "get_cell called with 3 ($table, $column, $row)\n" if $debug > 5;
    }
    elsif (scalar(@_) == 2) {
        $table = shift;
        my $colrow = shift;
        print STDERR "get_cell called with 2 ($table, $colrow)\n" if $debug > 5;
        if ( $colrow =~ /([A-Za-z])(\d+)/o ) {
            # 'A1' notation
            $column = $1;
            $row = $2;
            print STDERR "translated to col=$column, row=$row\n" if $debug > 5;
        }
        else {
            # Not 'A1' notation, so assume table skipped and we got $column and $row
            $row = $colrow;
            $column = $table;
            $table = $index;
            print STDERR "translated to col=$column, row=$row\n" if $debug > 5;
        }
    }
    elsif (scalar(@_) == 1) {
        $table = $index;
        my $colrow = shift;
        print STDERR "get_cell called with 1 ($colrow)\n" if $debug > 5;
        if ( $colrow =~ /([A-Za-z])(\d+)/o ) {
            # 'A1' notation ( > 25 columns ('ZB1' etc...) not supported)
            $column = $1;
            $row = $2;
            print STDERR "translated to col=$column, row=$row\n" if $debug > 5;
        }
        else {
            # Not 'A1' notation, so error
            print STDERR "Not A1 notation! error\n" if $debug > 5;
            return undef;
        }
    }
    else {
        print STDERR "No argument! error\n" if $debug > 5;
        return undef;
    }
    if ($column =~ /[A-Za-z]/o) {
        # ( > 25 columns ('ZB1' etc...) not supported)
        $column = ord(uc $column) - 64;
        print STDERR "Translated column to $column\n" if $debug > 5;
    }
    
    return @{ $TABLES{$table}->[$row] }[$column];
} # end sub get_cell

sub end {
    my $this = shift;
    my ($tag, $attr) = @_;
    &{ $tags{$tag . "_end"} }($attr) if ( exists $tags{$tag . "_end"} );
}

sub text {
    my ($self, $text) = @_;
    return unless $incell;
    $text = HTML::Entities::decode($text);
    print STDERR "Text started (table: $table_count, row: $row, cell: $cell):$text<\n" if $debug > 3;
    $TABLES{$table_count}->[$row]->[$cell] .= $text;
}

sub table_start {
    my $attr = shift;
    $table_count++;
    $index++;
    $row = 0;
    $cell = 0;
    print STDERR "Table $table_count started:\n" if $debug > 5;
    $TABLES{$table_count} = []; # new table array
}
sub table_end {
}

sub tr_start {
    my $attr = shift;
    $cell = 0;
    $row++;
    print STDERR "Table row $row started:\n" if $debug > 4;
}
sub tr_end {
    my $attr = shift;
    print STDERR "Table row ended:\n" if $debug > 4;
}
sub th_start {
    my $attr = shift;
    $cell++;
    print STDERR "Table H started:\n" if $debug >2;#, Dumper($attr);
    $incell = 1;
}
sub th_end {
    my $attr = shift;
    print STDERR "Table H ended:\n" if $debug >2;#, Dumper($attr);
    $incell = 0;
    my $text = $TABLES{$table_count}->[$row]->[$cell];
    if ($config{'trim'}) {
        $text =~ s/^\s+//o;
        $text =~ s/\s+$//o;
    }
    $TABLES{$table_count}->[$row]->[$cell] = $text;

}
sub td_start {
    my $attr = shift;
    $cell++;
    print STDERR "Table Data started:\n" if $debug >2;#, Dumper($attr);
    $incell = 1;

}
sub td_end {
    my $attr = shift;
    print STDERR "Table Data end:\n" if $debug >2;#, Dumper($attr);
    $incell = 0;

    my $text = $TABLES{$table_count}->[$row]->[$cell];
    if ($text && $config{'trim'}) {
        $text =~ s/^\s+//o;
        $text =~ s/\s+$//o;
    }
    $TABLES{$table_count}->[$row]->[$cell] = $text;
}

# "Properties"
sub table_count  { $table_count };

sub row_count    {
    my ($self, $table) = @_;
    $table = $index unless $table; # default to last table
    return scalar( @{$TABLES{$table}} );
};
sub cell_count    {
    my ($self, $table, $row) = @_;
    # ++ accept default to last table
    return scalar( @{$TABLES{$table}[$row]} );
};

sub all_tables {%TABLES};

1;
__END__

=head1 SYNOPSIS

    use HTML::ParseTables;

    my $p = new HTML::ParseTables();
    if ($html = shift) {
        $p->parse_file($html)
    }
    else {
        while (<DATA>) { $p->parse($_) }
    }
    
    print $p->table_count, " tables found\n";
    my %h = $p->all_tables;

    foreach $table (sort keys %h) {
        my @rows = $p->get_table($table);
        my $row_count = 0;
        print "\nTABLE $table. ",
              $p->row_count($table),
              " rows:\n";
        foreach $row (@rows) {
            print ++$row_count,
                  " (", scalar(@{$row}), " cells)\t",
                  join("\t", @{$row}), "\n";
        }
    }

    print "Table 1, Cell B2    : ",
          $p->get_cell(1, 'B2'), "\n";
    print "Last table, cell A1 : ",
          $p->get_cell('A1'), "\n";

__DATA__

    <HTML><BODY>
    <P> paragraph before table </P>
    <TABLE>
        <TR> <TD>A1</TD> <TD>B1</TD> </TR>
        <TR> <TD>A2</TD> <TD>B2</TD> </TR>
    </TABLE>
    <TABLE>
        <TR> <TD>T2-A1</TD> <TD>T2-B1</TD> </TR>
        <TR> <TD>T2-A2</TD> <TD>T2-B2</TD> </TR>
    </TABLE>

=head1 DESCRIPTION

Easy extraction of text from HTML documents containing tables. Tries to focus
on an intuitive interface to get at table content. Particularly, it allows
different notations to to get at individual cells, among which the popular
spreadsheet "B2" notation.

This version is to be considered "pre-alpha": it may contain many bugs, lots of things
are not documented and the interface may change quickly. For the documentation, the
only reliable thing to do is to look at the code. I have no time to polish it now, but
I was asked to post it, so here it is.

It works well for me in a few scripts that run daily, so hopefully you can use it too.

=head1 DETAILS

Important: the module uses 1 as the index to the first table/column/row/cell, not 0!

=head2 %config

Should allow setting of user preferences for output and things retained during parsing.

=head2 get_table([$table])

Returns table $table as a list of rows (rows being references to a list of cells).
First table is table 1. Without argument, returns last table.

=head2 get_table_as_text([$table])

Returns table $table as a string. Newlines between rows. The separator between cells depends
on $config{format}. Without argument, returns last table.

=head2 get_row([$table,] $row)

Returns row $row from table $table as a list of cells.
If $table is omitted, uses last table. First row is 1.

=head2 get_cell()

Accept different formats:

    get_cell($table, $column, $row)
     get_cell($table, 'B3')
    get_cell($column, $row)
    get_cell('A5')

Returns the cell content. If $table is omitted, uses last.

=head2 table_count()

Returns number of tables found. Takes no argument.

=head2 row_count([$table])

Returns number of rows in table $table or last table.

=head2 cell_count($table, $row)

Returns number of cells in row $row of table $table.

=head2 all_tables

Returns a hash with all tables. Keys are numbers from 1 to the number of tables.
Values are references to lists of lists (rows of cells).

=head1 LIMITATIONS

Lot's for now:

Doesn't handle nested tables.

Doesn't understand colspan and rowspan.

Documentation incomplete and possibly even wrong.

This is really not finished.

...?

=head1 BUGS

Let me know what you find

=head1 AUTHOR

Milivoj Ivkovic <mi@alma.ch>. Others welcome to extend it to more
operating systems which don't have an uptime command.

=head1 COPYRIGHT

Copyright Milivoj Ivkovic, 1999. Same license as Perl itself.

=cut
