Initial class construction
This commit is contained in:
209
Git/usr/lib/perl5/vendor_perl/HTML/PullParser.pm
Normal file
209
Git/usr/lib/perl5/vendor_perl/HTML/PullParser.pm
Normal file
@ -0,0 +1,209 @@
|
||||
package HTML::PullParser;
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA=qw(HTML::Parser);
|
||||
$VERSION = "3.57";
|
||||
|
||||
use strict;
|
||||
use Carp ();
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, %cnf) = @_;
|
||||
|
||||
# Construct argspecs for the various events
|
||||
my %argspec;
|
||||
for (qw(start end text declaration comment process default)) {
|
||||
my $tmp = delete $cnf{$_};
|
||||
next unless defined $tmp;
|
||||
$argspec{$_} = $tmp;
|
||||
}
|
||||
Carp::croak("Info not collected for any events")
|
||||
unless %argspec;
|
||||
|
||||
my $file = delete $cnf{file};
|
||||
my $doc = delete $cnf{doc};
|
||||
Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
|
||||
if defined($file) && defined($doc);
|
||||
Carp::croak("No 'doc' or 'file' given to parse from")
|
||||
unless defined($file) || defined($doc);
|
||||
|
||||
# Create object
|
||||
$cnf{api_version} = 3;
|
||||
my $self = $class->SUPER::new(%cnf);
|
||||
|
||||
my $accum = $self->{pullparser_accum} = [];
|
||||
while (my($event, $argspec) = each %argspec) {
|
||||
$self->SUPER::handler($event => $accum, $argspec);
|
||||
}
|
||||
|
||||
if (defined $doc) {
|
||||
$self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
|
||||
$self->{pullparser_str_pos} = 0;
|
||||
}
|
||||
else {
|
||||
if (!ref($file) && ref(\$file) ne "GLOB") {
|
||||
require IO::File;
|
||||
$file = IO::File->new($file, "r") || return;
|
||||
}
|
||||
|
||||
$self->{pullparser_file} = $file;
|
||||
}
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub handler
|
||||
{
|
||||
Carp::croak("Can't set handlers for HTML::PullParser");
|
||||
}
|
||||
|
||||
|
||||
sub get_token
|
||||
{
|
||||
my $self = shift;
|
||||
while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
|
||||
if (my $f = $self->{pullparser_file}) {
|
||||
# must try to parse more from the file
|
||||
my $buf;
|
||||
if (read($f, $buf, 512)) {
|
||||
$self->parse($buf);
|
||||
} else {
|
||||
$self->eof;
|
||||
$self->{pullparser_eof}++;
|
||||
delete $self->{pullparser_file};
|
||||
}
|
||||
}
|
||||
elsif (my $sref = $self->{pullparser_str_ref}) {
|
||||
# must try to parse more from the scalar
|
||||
my $pos = $self->{pullparser_str_pos};
|
||||
my $chunk = substr($$sref, $pos, 512);
|
||||
$self->parse($chunk);
|
||||
$pos += length($chunk);
|
||||
if ($pos < length($$sref)) {
|
||||
$self->{pullparser_str_pos} = $pos;
|
||||
}
|
||||
else {
|
||||
$self->eof;
|
||||
$self->{pullparser_eof}++;
|
||||
delete $self->{pullparser_str_ref};
|
||||
delete $self->{pullparser_str_pos};
|
||||
}
|
||||
}
|
||||
else {
|
||||
die;
|
||||
}
|
||||
}
|
||||
shift @{$self->{pullparser_accum}};
|
||||
}
|
||||
|
||||
|
||||
sub unget_token
|
||||
{
|
||||
my $self = shift;
|
||||
unshift @{$self->{pullparser_accum}}, @_;
|
||||
$self;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::PullParser - Alternative HTML::Parser interface
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::PullParser;
|
||||
|
||||
$p = HTML::PullParser->new(file => "index.html",
|
||||
start => 'event, tagname, @attr',
|
||||
end => 'event, tagname',
|
||||
ignore_elements => [qw(script style)],
|
||||
) || die "Can't open: $!";
|
||||
while (my $token = $p->get_token) {
|
||||
#...do something with $token
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The HTML::PullParser is an alternative interface to the HTML::Parser class.
|
||||
It basically turns the HTML::Parser inside out. You associate a file
|
||||
(or any IO::Handle object or string) with the parser at construction time and
|
||||
then repeatedly call $parser->get_token to obtain the tags and text
|
||||
found in the parsed document.
|
||||
|
||||
The following methods are provided:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::PullParser->new( file => $file, %options )
|
||||
|
||||
=item $p = HTML::PullParser->new( doc => \$doc, %options )
|
||||
|
||||
A C<HTML::PullParser> can be made to parse from either a file or a
|
||||
literal document based on whether the C<file> or C<doc> option is
|
||||
passed to the parser's constructor.
|
||||
|
||||
The C<file> passed in can either be a file name or a file handle
|
||||
object. If a file name is passed, and it can't be opened for reading,
|
||||
then the constructor will return an undefined value and $! will tell
|
||||
you why it failed. Otherwise the argument is taken to be some object
|
||||
that the C<HTML::PullParser> can read() from when it needs more data.
|
||||
The stream will be read() until EOF, but not closed.
|
||||
|
||||
A C<doc> can be passed plain or as a reference
|
||||
to a scalar. If a reference is passed then the value of this scalar
|
||||
should not be changed before all tokens have been extracted.
|
||||
|
||||
Next the information to be returned for the different token types must
|
||||
be set up. This is done by simply associating an argspec (as defined
|
||||
in L<HTML::Parser>) with the events you have an interest in. For
|
||||
instance, if you want C<start> tokens to be reported as the string
|
||||
C<'S'> followed by the tagname and the attributes you might pass an
|
||||
C<start>-option like this:
|
||||
|
||||
$p = HTML::PullParser->new(
|
||||
doc => $document_to_parse,
|
||||
start => '"S", tagname, @attr',
|
||||
end => '"E", tagname',
|
||||
);
|
||||
|
||||
At last other C<HTML::Parser> options, like C<ignore_tags>, and
|
||||
C<unbroken_text>, can be passed in. Note that you should not use the
|
||||
I<event>_h options to set up parser handlers. That would confuse the
|
||||
inner logic of C<HTML::PullParser>.
|
||||
|
||||
=item $token = $p->get_token
|
||||
|
||||
This method will return the next I<token> found in the HTML document,
|
||||
or C<undef> at the end of the document. The token is returned as an
|
||||
array reference. The content of this array match the argspec set up
|
||||
during C<HTML::PullParser> construction.
|
||||
|
||||
=item $p->unget_token( @tokens )
|
||||
|
||||
If you find out you have read too many tokens you can push them back,
|
||||
so that they are returned again the next time $p->get_token is called.
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
The 'eg/hform' script shows how we might parse the form section of
|
||||
HTML::Documents using HTML::PullParser.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTML::TokeParser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1998-2001 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
Reference in New Issue
Block a user