Initial class construction
This commit is contained in:
483
Git/usr/lib/perl5/vendor_perl/HTML/Entities.pm
Normal file
483
Git/usr/lib/perl5/vendor_perl/HTML/Entities.pm
Normal file
@ -0,0 +1,483 @@
|
||||
package HTML::Entities;
|
||||
|
||||
=encoding utf8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Entities - Encode or decode strings with HTML entities
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::Entities;
|
||||
|
||||
$a = "Våre norske tegn bør æres";
|
||||
decode_entities($a);
|
||||
encode_entities($a, "\200-\377");
|
||||
|
||||
For example, this:
|
||||
|
||||
$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
|
||||
print encode_entities($input), "\n"
|
||||
|
||||
Prints this out:
|
||||
|
||||
vis-à-vis Beyoncé's naïve
|
||||
papier-mâché résumé
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module deals with encoding and decoding of strings with HTML
|
||||
character entities. The module provides the following functions:
|
||||
|
||||
=over 4
|
||||
|
||||
=item decode_entities( $string, ... )
|
||||
|
||||
This routine replaces HTML entities found in the $string with the
|
||||
corresponding Unicode character. Unrecognized entities are left alone.
|
||||
|
||||
If multiple strings are provided as argument they are each decoded
|
||||
separately and the same number of strings are returned.
|
||||
|
||||
If called in void context the arguments are decoded in-place.
|
||||
|
||||
This routine is exported by default.
|
||||
|
||||
=item _decode_entities( $string, \%entity2char )
|
||||
|
||||
=item _decode_entities( $string, \%entity2char, $expand_prefix )
|
||||
|
||||
This will in-place replace HTML entities in $string. The %entity2char
|
||||
hash must be provided. Named entities not found in the %entity2char
|
||||
hash are left alone. Numeric entities are expanded unless their value
|
||||
overflow.
|
||||
|
||||
The keys in %entity2char are the entity names to be expanded and their
|
||||
values are what they should expand into. The values do not have to be
|
||||
single character strings. If a key has ";" as suffix,
|
||||
then occurrences in $string are only expanded if properly terminated
|
||||
with ";". Entities without ";" will be expanded regardless of how
|
||||
they are terminated for compatibility with how common browsers treat
|
||||
entities in the Latin-1 range.
|
||||
|
||||
If $expand_prefix is TRUE then entities without trailing ";" in
|
||||
%entity2char will even be expanded as a prefix of a longer
|
||||
unrecognized name. The longest matching name in %entity2char will be
|
||||
used. This is mainly present for compatibility with an MSIE
|
||||
misfeature.
|
||||
|
||||
$string = "foo bar";
|
||||
_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
|
||||
print $string; # will print "foo bar"
|
||||
|
||||
This routine is exported by default.
|
||||
|
||||
=item encode_entities( $string )
|
||||
|
||||
=item encode_entities( $string, $unsafe_chars )
|
||||
|
||||
This routine replaces unsafe characters in $string with their entity
|
||||
representation. A second argument can be given to specify which characters to
|
||||
consider unsafe. The unsafe characters is specified using the regular
|
||||
expression character class syntax (what you find within brackets in regular
|
||||
expressions).
|
||||
|
||||
The default set of characters to encode are control chars, high-bit chars, and
|
||||
the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
|
||||
for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
|
||||
>> characters:
|
||||
|
||||
$encoded = encode_entities($input, '<>&"');
|
||||
|
||||
and this would only encode non-plain ascii:
|
||||
|
||||
$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
|
||||
|
||||
This routine is exported by default.
|
||||
|
||||
=item encode_entities_numeric( $string )
|
||||
|
||||
=item encode_entities_numeric( $string, $unsafe_chars )
|
||||
|
||||
This routine works just like encode_entities, except that the replacement
|
||||
entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
|
||||
example, C<encode_entities("r\xF4le")> returns "rôle", but
|
||||
C<encode_entities_numeric("r\xF4le")> returns "rôle".
|
||||
|
||||
This routine is I<not> exported by default. But you can always
|
||||
export it with C<use HTML::Entities qw(encode_entities_numeric);>
|
||||
or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
|
||||
|
||||
=back
|
||||
|
||||
All these routines modify the string passed as the first argument, if
|
||||
called in a void context. In scalar and array contexts, the encoded or
|
||||
decoded string is returned (without changing the input string).
|
||||
|
||||
If you prefer not to import these routines into your namespace, you can
|
||||
call them as:
|
||||
|
||||
use HTML::Entities ();
|
||||
$decoded = HTML::Entities::decode($a);
|
||||
$encoded = HTML::Entities::encode($a);
|
||||
$encoded = HTML::Entities::encode_numeric($a);
|
||||
|
||||
The module can also export the %char2entity and the %entity2char
|
||||
hashes, which contain the mapping from all characters to the
|
||||
corresponding entities (and vice versa, respectively).
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1995-2006 Gisle Aas. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
|
||||
use vars qw(%entity2char %char2entity);
|
||||
|
||||
require 5.004;
|
||||
require Exporter;
|
||||
@ISA = qw(Exporter);
|
||||
|
||||
@EXPORT = qw(encode_entities decode_entities _decode_entities);
|
||||
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
|
||||
|
||||
$VERSION = "3.69";
|
||||
sub Version { $VERSION; }
|
||||
|
||||
require HTML::Parser; # for fast XS implemented decode_entities
|
||||
|
||||
|
||||
%entity2char = (
|
||||
# Some normal chars that have special meaning in SGML context
|
||||
amp => '&', # ampersand
|
||||
'gt' => '>', # greater than
|
||||
'lt' => '<', # less than
|
||||
quot => '"', # double quote
|
||||
apos => "'", # single quote
|
||||
|
||||
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
|
||||
AElig => chr(198), # capital AE diphthong (ligature)
|
||||
Aacute => chr(193), # capital A, acute accent
|
||||
Acirc => chr(194), # capital A, circumflex accent
|
||||
Agrave => chr(192), # capital A, grave accent
|
||||
Aring => chr(197), # capital A, ring
|
||||
Atilde => chr(195), # capital A, tilde
|
||||
Auml => chr(196), # capital A, dieresis or umlaut mark
|
||||
Ccedil => chr(199), # capital C, cedilla
|
||||
ETH => chr(208), # capital Eth, Icelandic
|
||||
Eacute => chr(201), # capital E, acute accent
|
||||
Ecirc => chr(202), # capital E, circumflex accent
|
||||
Egrave => chr(200), # capital E, grave accent
|
||||
Euml => chr(203), # capital E, dieresis or umlaut mark
|
||||
Iacute => chr(205), # capital I, acute accent
|
||||
Icirc => chr(206), # capital I, circumflex accent
|
||||
Igrave => chr(204), # capital I, grave accent
|
||||
Iuml => chr(207), # capital I, dieresis or umlaut mark
|
||||
Ntilde => chr(209), # capital N, tilde
|
||||
Oacute => chr(211), # capital O, acute accent
|
||||
Ocirc => chr(212), # capital O, circumflex accent
|
||||
Ograve => chr(210), # capital O, grave accent
|
||||
Oslash => chr(216), # capital O, slash
|
||||
Otilde => chr(213), # capital O, tilde
|
||||
Ouml => chr(214), # capital O, dieresis or umlaut mark
|
||||
THORN => chr(222), # capital THORN, Icelandic
|
||||
Uacute => chr(218), # capital U, acute accent
|
||||
Ucirc => chr(219), # capital U, circumflex accent
|
||||
Ugrave => chr(217), # capital U, grave accent
|
||||
Uuml => chr(220), # capital U, dieresis or umlaut mark
|
||||
Yacute => chr(221), # capital Y, acute accent
|
||||
aacute => chr(225), # small a, acute accent
|
||||
acirc => chr(226), # small a, circumflex accent
|
||||
aelig => chr(230), # small ae diphthong (ligature)
|
||||
agrave => chr(224), # small a, grave accent
|
||||
aring => chr(229), # small a, ring
|
||||
atilde => chr(227), # small a, tilde
|
||||
auml => chr(228), # small a, dieresis or umlaut mark
|
||||
ccedil => chr(231), # small c, cedilla
|
||||
eacute => chr(233), # small e, acute accent
|
||||
ecirc => chr(234), # small e, circumflex accent
|
||||
egrave => chr(232), # small e, grave accent
|
||||
eth => chr(240), # small eth, Icelandic
|
||||
euml => chr(235), # small e, dieresis or umlaut mark
|
||||
iacute => chr(237), # small i, acute accent
|
||||
icirc => chr(238), # small i, circumflex accent
|
||||
igrave => chr(236), # small i, grave accent
|
||||
iuml => chr(239), # small i, dieresis or umlaut mark
|
||||
ntilde => chr(241), # small n, tilde
|
||||
oacute => chr(243), # small o, acute accent
|
||||
ocirc => chr(244), # small o, circumflex accent
|
||||
ograve => chr(242), # small o, grave accent
|
||||
oslash => chr(248), # small o, slash
|
||||
otilde => chr(245), # small o, tilde
|
||||
ouml => chr(246), # small o, dieresis or umlaut mark
|
||||
szlig => chr(223), # small sharp s, German (sz ligature)
|
||||
thorn => chr(254), # small thorn, Icelandic
|
||||
uacute => chr(250), # small u, acute accent
|
||||
ucirc => chr(251), # small u, circumflex accent
|
||||
ugrave => chr(249), # small u, grave accent
|
||||
uuml => chr(252), # small u, dieresis or umlaut mark
|
||||
yacute => chr(253), # small y, acute accent
|
||||
yuml => chr(255), # small y, dieresis or umlaut mark
|
||||
|
||||
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
|
||||
copy => chr(169), # copyright sign
|
||||
reg => chr(174), # registered sign
|
||||
nbsp => chr(160), # non breaking space
|
||||
|
||||
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
|
||||
iexcl => chr(161),
|
||||
cent => chr(162),
|
||||
pound => chr(163),
|
||||
curren => chr(164),
|
||||
yen => chr(165),
|
||||
brvbar => chr(166),
|
||||
sect => chr(167),
|
||||
uml => chr(168),
|
||||
ordf => chr(170),
|
||||
laquo => chr(171),
|
||||
'not' => chr(172), # not is a keyword in perl
|
||||
shy => chr(173),
|
||||
macr => chr(175),
|
||||
deg => chr(176),
|
||||
plusmn => chr(177),
|
||||
sup1 => chr(185),
|
||||
sup2 => chr(178),
|
||||
sup3 => chr(179),
|
||||
acute => chr(180),
|
||||
micro => chr(181),
|
||||
para => chr(182),
|
||||
middot => chr(183),
|
||||
cedil => chr(184),
|
||||
ordm => chr(186),
|
||||
raquo => chr(187),
|
||||
frac14 => chr(188),
|
||||
frac12 => chr(189),
|
||||
frac34 => chr(190),
|
||||
iquest => chr(191),
|
||||
'times' => chr(215), # times is a keyword in perl
|
||||
divide => chr(247),
|
||||
|
||||
( $] > 5.007 ? (
|
||||
'OElig;' => chr(338),
|
||||
'oelig;' => chr(339),
|
||||
'Scaron;' => chr(352),
|
||||
'scaron;' => chr(353),
|
||||
'Yuml;' => chr(376),
|
||||
'fnof;' => chr(402),
|
||||
'circ;' => chr(710),
|
||||
'tilde;' => chr(732),
|
||||
'Alpha;' => chr(913),
|
||||
'Beta;' => chr(914),
|
||||
'Gamma;' => chr(915),
|
||||
'Delta;' => chr(916),
|
||||
'Epsilon;' => chr(917),
|
||||
'Zeta;' => chr(918),
|
||||
'Eta;' => chr(919),
|
||||
'Theta;' => chr(920),
|
||||
'Iota;' => chr(921),
|
||||
'Kappa;' => chr(922),
|
||||
'Lambda;' => chr(923),
|
||||
'Mu;' => chr(924),
|
||||
'Nu;' => chr(925),
|
||||
'Xi;' => chr(926),
|
||||
'Omicron;' => chr(927),
|
||||
'Pi;' => chr(928),
|
||||
'Rho;' => chr(929),
|
||||
'Sigma;' => chr(931),
|
||||
'Tau;' => chr(932),
|
||||
'Upsilon;' => chr(933),
|
||||
'Phi;' => chr(934),
|
||||
'Chi;' => chr(935),
|
||||
'Psi;' => chr(936),
|
||||
'Omega;' => chr(937),
|
||||
'alpha;' => chr(945),
|
||||
'beta;' => chr(946),
|
||||
'gamma;' => chr(947),
|
||||
'delta;' => chr(948),
|
||||
'epsilon;' => chr(949),
|
||||
'zeta;' => chr(950),
|
||||
'eta;' => chr(951),
|
||||
'theta;' => chr(952),
|
||||
'iota;' => chr(953),
|
||||
'kappa;' => chr(954),
|
||||
'lambda;' => chr(955),
|
||||
'mu;' => chr(956),
|
||||
'nu;' => chr(957),
|
||||
'xi;' => chr(958),
|
||||
'omicron;' => chr(959),
|
||||
'pi;' => chr(960),
|
||||
'rho;' => chr(961),
|
||||
'sigmaf;' => chr(962),
|
||||
'sigma;' => chr(963),
|
||||
'tau;' => chr(964),
|
||||
'upsilon;' => chr(965),
|
||||
'phi;' => chr(966),
|
||||
'chi;' => chr(967),
|
||||
'psi;' => chr(968),
|
||||
'omega;' => chr(969),
|
||||
'thetasym;' => chr(977),
|
||||
'upsih;' => chr(978),
|
||||
'piv;' => chr(982),
|
||||
'ensp;' => chr(8194),
|
||||
'emsp;' => chr(8195),
|
||||
'thinsp;' => chr(8201),
|
||||
'zwnj;' => chr(8204),
|
||||
'zwj;' => chr(8205),
|
||||
'lrm;' => chr(8206),
|
||||
'rlm;' => chr(8207),
|
||||
'ndash;' => chr(8211),
|
||||
'mdash;' => chr(8212),
|
||||
'lsquo;' => chr(8216),
|
||||
'rsquo;' => chr(8217),
|
||||
'sbquo;' => chr(8218),
|
||||
'ldquo;' => chr(8220),
|
||||
'rdquo;' => chr(8221),
|
||||
'bdquo;' => chr(8222),
|
||||
'dagger;' => chr(8224),
|
||||
'Dagger;' => chr(8225),
|
||||
'bull;' => chr(8226),
|
||||
'hellip;' => chr(8230),
|
||||
'permil;' => chr(8240),
|
||||
'prime;' => chr(8242),
|
||||
'Prime;' => chr(8243),
|
||||
'lsaquo;' => chr(8249),
|
||||
'rsaquo;' => chr(8250),
|
||||
'oline;' => chr(8254),
|
||||
'frasl;' => chr(8260),
|
||||
'euro;' => chr(8364),
|
||||
'image;' => chr(8465),
|
||||
'weierp;' => chr(8472),
|
||||
'real;' => chr(8476),
|
||||
'trade;' => chr(8482),
|
||||
'alefsym;' => chr(8501),
|
||||
'larr;' => chr(8592),
|
||||
'uarr;' => chr(8593),
|
||||
'rarr;' => chr(8594),
|
||||
'darr;' => chr(8595),
|
||||
'harr;' => chr(8596),
|
||||
'crarr;' => chr(8629),
|
||||
'lArr;' => chr(8656),
|
||||
'uArr;' => chr(8657),
|
||||
'rArr;' => chr(8658),
|
||||
'dArr;' => chr(8659),
|
||||
'hArr;' => chr(8660),
|
||||
'forall;' => chr(8704),
|
||||
'part;' => chr(8706),
|
||||
'exist;' => chr(8707),
|
||||
'empty;' => chr(8709),
|
||||
'nabla;' => chr(8711),
|
||||
'isin;' => chr(8712),
|
||||
'notin;' => chr(8713),
|
||||
'ni;' => chr(8715),
|
||||
'prod;' => chr(8719),
|
||||
'sum;' => chr(8721),
|
||||
'minus;' => chr(8722),
|
||||
'lowast;' => chr(8727),
|
||||
'radic;' => chr(8730),
|
||||
'prop;' => chr(8733),
|
||||
'infin;' => chr(8734),
|
||||
'ang;' => chr(8736),
|
||||
'and;' => chr(8743),
|
||||
'or;' => chr(8744),
|
||||
'cap;' => chr(8745),
|
||||
'cup;' => chr(8746),
|
||||
'int;' => chr(8747),
|
||||
'there4;' => chr(8756),
|
||||
'sim;' => chr(8764),
|
||||
'cong;' => chr(8773),
|
||||
'asymp;' => chr(8776),
|
||||
'ne;' => chr(8800),
|
||||
'equiv;' => chr(8801),
|
||||
'le;' => chr(8804),
|
||||
'ge;' => chr(8805),
|
||||
'sub;' => chr(8834),
|
||||
'sup;' => chr(8835),
|
||||
'nsub;' => chr(8836),
|
||||
'sube;' => chr(8838),
|
||||
'supe;' => chr(8839),
|
||||
'oplus;' => chr(8853),
|
||||
'otimes;' => chr(8855),
|
||||
'perp;' => chr(8869),
|
||||
'sdot;' => chr(8901),
|
||||
'lceil;' => chr(8968),
|
||||
'rceil;' => chr(8969),
|
||||
'lfloor;' => chr(8970),
|
||||
'rfloor;' => chr(8971),
|
||||
'lang;' => chr(9001),
|
||||
'rang;' => chr(9002),
|
||||
'loz;' => chr(9674),
|
||||
'spades;' => chr(9824),
|
||||
'clubs;' => chr(9827),
|
||||
'hearts;' => chr(9829),
|
||||
'diams;' => chr(9830),
|
||||
) : ())
|
||||
);
|
||||
|
||||
|
||||
# Make the opposite mapping
|
||||
while (my($entity, $char) = each(%entity2char)) {
|
||||
$entity =~ s/;\z//;
|
||||
$char2entity{$char} = "&$entity;";
|
||||
}
|
||||
delete $char2entity{"'"}; # only one-way decoding
|
||||
|
||||
# Fill in missing entities
|
||||
for (0 .. 255) {
|
||||
next if exists $char2entity{chr($_)};
|
||||
$char2entity{chr($_)} = "&#$_;";
|
||||
}
|
||||
|
||||
my %subst; # compiled encoding regexps
|
||||
|
||||
sub encode_entities
|
||||
{
|
||||
return undef unless defined $_[0];
|
||||
my $ref;
|
||||
if (defined wantarray) {
|
||||
my $x = $_[0];
|
||||
$ref = \$x; # copy
|
||||
} else {
|
||||
$ref = \$_[0]; # modify in-place
|
||||
}
|
||||
if (defined $_[1] and length $_[1]) {
|
||||
unless (exists $subst{$_[1]}) {
|
||||
# Because we can't compile regex we fake it with a cached sub
|
||||
my $chars = $_[1];
|
||||
$chars =~ s,(?<!\\)([]/]),\\$1,g;
|
||||
$chars =~ s,(?<!\\)\\\z,\\\\,;
|
||||
my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
|
||||
$subst{$_[1]} = eval $code;
|
||||
die( $@ . " while trying to turn range: \"$_[1]\"\n "
|
||||
. "into code: $code\n "
|
||||
) if $@;
|
||||
}
|
||||
&{$subst{$_[1]}}($$ref);
|
||||
} else {
|
||||
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
|
||||
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
|
||||
}
|
||||
$$ref;
|
||||
}
|
||||
|
||||
sub encode_entities_numeric {
|
||||
local %char2entity;
|
||||
return &encode_entities; # a goto &encode_entities wouldn't work
|
||||
}
|
||||
|
||||
|
||||
sub num_entity {
|
||||
sprintf "&#x%X;", ord($_[0]);
|
||||
}
|
||||
|
||||
# Set up aliases
|
||||
*encode = \&encode_entities;
|
||||
*encode_numeric = \&encode_entities_numeric;
|
||||
*encode_numerically = \&encode_entities_numeric;
|
||||
*decode = \&decode_entities;
|
||||
|
||||
1;
|
112
Git/usr/lib/perl5/vendor_perl/HTML/Filter.pm
Normal file
112
Git/usr/lib/perl5/vendor_perl/HTML/Filter.pm
Normal file
@ -0,0 +1,112 @@
|
||||
package HTML::Filter;
|
||||
|
||||
use strict;
|
||||
use vars qw(@ISA $VERSION);
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA=qw(HTML::Parser);
|
||||
|
||||
$VERSION = "3.72";
|
||||
|
||||
sub declaration { $_[0]->output("<!$_[1]>") }
|
||||
sub process { $_[0]->output($_[2]) }
|
||||
sub comment { $_[0]->output("<!--$_[1]-->") }
|
||||
sub start { $_[0]->output($_[4]) }
|
||||
sub end { $_[0]->output($_[2]) }
|
||||
sub text { $_[0]->output($_[1]) }
|
||||
|
||||
sub output { print $_[1] }
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::Filter - Filter HTML text through the parser
|
||||
|
||||
=head1 NOTE
|
||||
|
||||
B<This module is deprecated.> The C<HTML::Parser> now provides the
|
||||
functionally of C<HTML::Filter> much more efficiently with the
|
||||
C<default> handler.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::Filter;
|
||||
$p = HTML::Filter->new->parse_file("index.html");
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
C<HTML::Filter> is an HTML parser that by default prints the
|
||||
original text of each HTML element (a slow version of cat(1) basically).
|
||||
The callback methods may be overridden to modify the filtering for some
|
||||
HTML elements and you can override output() method which is called to
|
||||
print the HTML text.
|
||||
|
||||
C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that
|
||||
the document should be given to the parser by calling the $p->parse()
|
||||
or $p->parse_file() methods.
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
The first example is a filter that will remove all comments from an
|
||||
HTML file. This is achieved by simply overriding the comment method
|
||||
to do nothing.
|
||||
|
||||
package CommentStripper;
|
||||
require HTML::Filter;
|
||||
@ISA=qw(HTML::Filter);
|
||||
sub comment { } # ignore comments
|
||||
|
||||
The second example shows a filter that will remove any E<lt>TABLE>s
|
||||
found in the HTML file. We specialize the start() and end() methods
|
||||
to count table tags and then make output not happen when inside a
|
||||
table.
|
||||
|
||||
package TableStripper;
|
||||
require HTML::Filter;
|
||||
@ISA=qw(HTML::Filter);
|
||||
sub start
|
||||
{
|
||||
my $self = shift;
|
||||
$self->{table_seen}++ if $_[0] eq "table";
|
||||
$self->SUPER::start(@_);
|
||||
}
|
||||
|
||||
sub end
|
||||
{
|
||||
my $self = shift;
|
||||
$self->SUPER::end(@_);
|
||||
$self->{table_seen}-- if $_[0] eq "table";
|
||||
}
|
||||
|
||||
sub output
|
||||
{
|
||||
my $self = shift;
|
||||
unless ($self->{table_seen}) {
|
||||
$self->SUPER::output(@_);
|
||||
}
|
||||
}
|
||||
|
||||
If you want to collect the parsed text internally you might want to do
|
||||
something like this:
|
||||
|
||||
package FilterIntoString;
|
||||
require HTML::Filter;
|
||||
@ISA=qw(HTML::Filter);
|
||||
sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
|
||||
sub filtered_html { join("", @{$_[0]->{fhtml}}) }
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1997-1999 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
315
Git/usr/lib/perl5/vendor_perl/HTML/HeadParser.pm
Normal file
315
Git/usr/lib/perl5/vendor_perl/HTML/HeadParser.pm
Normal file
@ -0,0 +1,315 @@
|
||||
package HTML::HeadParser;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::HeadParser - Parse <HEAD> section of a HTML document
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::HeadParser;
|
||||
$p = HTML::HeadParser->new;
|
||||
$p->parse($text) and print "not finished";
|
||||
|
||||
$p->header('Title') # to access <title>....</title>
|
||||
$p->header('Content-Base') # to access <base href="http://...">
|
||||
$p->header('Foo') # to access <meta http-equiv="Foo" content="...">
|
||||
$p->header('X-Meta-Author') # to access <meta name="author" content="...">
|
||||
$p->header('X-Meta-Charset') # to access <meta charset="...">
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The C<HTML::HeadParser> is a specialized (and lightweight)
|
||||
C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
|
||||
section of an HTML document. The parse() method
|
||||
will return a FALSE value as soon as some E<lt>BODY> element or body
|
||||
text are found, and should not be called again after this.
|
||||
|
||||
Note that the C<HTML::HeadParser> might get confused if raw undecoded
|
||||
UTF-8 is passed to the parse() method. Make sure the strings are
|
||||
properly decoded before passing them on.
|
||||
|
||||
The C<HTML::HeadParser> keeps a reference to a header object, and the
|
||||
parser will update this header object as the various elements of the
|
||||
E<lt>HEAD> section of the HTML document are recognized. The following
|
||||
header fields are affected:
|
||||
|
||||
=over 4
|
||||
|
||||
=item Content-Base:
|
||||
|
||||
The I<Content-Base> header is initialized from the E<lt>base
|
||||
href="..."> element.
|
||||
|
||||
=item Title:
|
||||
|
||||
The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
|
||||
element.
|
||||
|
||||
=item Isindex:
|
||||
|
||||
The I<Isindex> header will be added if there is a E<lt>isindex>
|
||||
element in the E<lt>head>. The header value is initialized from the
|
||||
I<prompt> attribute if it is present. If no I<prompt> attribute is
|
||||
given it will have '?' as the value.
|
||||
|
||||
=item X-Meta-Foo:
|
||||
|
||||
All E<lt>meta> elements containing a C<name> attribute will result in
|
||||
headers using the prefix C<X-Meta-> appended with the value of the
|
||||
C<name> attribute as the name of the header, and the value of the
|
||||
C<content> attribute as the pushed header value.
|
||||
|
||||
E<lt>meta> elements containing a C<http-equiv> attribute will result
|
||||
in headers as in above, but without the C<X-Meta-> prefix in the
|
||||
header name.
|
||||
|
||||
E<lt>meta> elements containing a C<charset> attribute will result in
|
||||
an C<X-Meta-Charset> header, using the value of the C<charset>
|
||||
attribute as the pushed header value.
|
||||
|
||||
The ':' character can't be represented in header field names, so
|
||||
if the meta element contains this char it's substituted with '-'
|
||||
before forming the field name.
|
||||
|
||||
=back
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
The following methods (in addition to those provided by the
|
||||
superclass) are available:
|
||||
|
||||
=over 4
|
||||
|
||||
=cut
|
||||
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA = qw(HTML::Parser);
|
||||
|
||||
use HTML::Entities ();
|
||||
|
||||
use strict;
|
||||
use vars qw($VERSION $DEBUG);
|
||||
#$DEBUG = 1;
|
||||
$VERSION = "3.71";
|
||||
|
||||
=item $hp = HTML::HeadParser->new
|
||||
|
||||
=item $hp = HTML::HeadParser->new( $header )
|
||||
|
||||
The object constructor. The optional $header argument should be a
|
||||
reference to an object that implement the header() and push_header()
|
||||
methods as defined by the C<HTTP::Headers> class. Normally it will be
|
||||
of some class that is a or delegates to the C<HTTP::Headers> class.
|
||||
|
||||
If no $header is given C<HTML::HeadParser> will create an
|
||||
C<HTTP::Headers> object by itself (initially empty).
|
||||
|
||||
=cut
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, $header) = @_;
|
||||
unless ($header) {
|
||||
require HTTP::Headers;
|
||||
$header = HTTP::Headers->new;
|
||||
}
|
||||
|
||||
my $self = $class->SUPER::new(api_version => 3,
|
||||
start_h => ["start", "self,tagname,attr"],
|
||||
end_h => ["end", "self,tagname"],
|
||||
text_h => ["text", "self,text"],
|
||||
ignore_elements => [qw(script style)],
|
||||
);
|
||||
$self->{'header'} = $header;
|
||||
$self->{'tag'} = ''; # name of active element that takes textual content
|
||||
$self->{'text'} = ''; # the accumulated text associated with the element
|
||||
$self;
|
||||
}
|
||||
|
||||
=item $hp->header;
|
||||
|
||||
Returns a reference to the header object.
|
||||
|
||||
=item $hp->header( $key )
|
||||
|
||||
Returns a header value. It is just a shorter way to write
|
||||
C<$hp-E<gt>header-E<gt>header($key)>.
|
||||
|
||||
=cut
|
||||
|
||||
sub header
|
||||
{
|
||||
my $self = shift;
|
||||
return $self->{'header'} unless @_;
|
||||
$self->{'header'}->header(@_);
|
||||
}
|
||||
|
||||
sub as_string # legacy
|
||||
{
|
||||
my $self = shift;
|
||||
$self->{'header'}->as_string;
|
||||
}
|
||||
|
||||
sub flush_text # internal
|
||||
{
|
||||
my $self = shift;
|
||||
my $tag = $self->{'tag'};
|
||||
my $text = $self->{'text'};
|
||||
$text =~ s/^\s+//;
|
||||
$text =~ s/\s+$//;
|
||||
$text =~ s/\s+/ /g;
|
||||
print "FLUSH $tag => '$text'\n" if $DEBUG;
|
||||
if ($tag eq 'title') {
|
||||
my $decoded;
|
||||
$decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode;
|
||||
HTML::Entities::decode($text);
|
||||
utf8::encode($text) if $decoded;
|
||||
$self->{'header'}->push_header(Title => $text);
|
||||
}
|
||||
$self->{'tag'} = $self->{'text'} = '';
|
||||
}
|
||||
|
||||
# This is an quote from the HTML3.2 DTD which shows which elements
|
||||
# that might be present in a <HEAD>...</HEAD>. Also note that the
|
||||
# <HEAD> tags themselves might be missing:
|
||||
#
|
||||
# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
|
||||
# SCRIPT* & META* & LINK*">
|
||||
#
|
||||
# <!ELEMENT HEAD O O (%head.content)>
|
||||
#
|
||||
# From HTML 4.01:
|
||||
#
|
||||
# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT">
|
||||
# <!ENTITY % head.content "TITLE & BASE?">
|
||||
# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)>
|
||||
#
|
||||
# From HTML 5 as of WD-html5-20090825:
|
||||
#
|
||||
# One or more elements of metadata content, [...]
|
||||
# => base, command, link, meta, noscript, script, style, title
|
||||
|
||||
sub start
|
||||
{
|
||||
my($self, $tag, $attr) = @_; # $attr is reference to a HASH
|
||||
print "START[$tag]\n" if $DEBUG;
|
||||
$self->flush_text if $self->{'tag'};
|
||||
if ($tag eq 'meta') {
|
||||
my $key = $attr->{'http-equiv'};
|
||||
if (!defined($key) || !length($key)) {
|
||||
if ($attr->{name}) {
|
||||
$key = "X-Meta-\u$attr->{name}";
|
||||
} elsif ($attr->{charset}) { # HTML 5 <meta charset="...">
|
||||
$key = "X-Meta-Charset";
|
||||
$self->{header}->push_header($key => $attr->{charset});
|
||||
return;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
$key =~ s/:/-/g;
|
||||
$self->{'header'}->push_header($key => $attr->{content});
|
||||
} elsif ($tag eq 'base') {
|
||||
return unless exists $attr->{href};
|
||||
(my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5
|
||||
$self->{'header'}->push_header('Content-Base' => $base);
|
||||
} elsif ($tag eq 'isindex') {
|
||||
# This is a non-standard header. Perhaps we should just ignore
|
||||
# this element
|
||||
$self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
|
||||
} elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
|
||||
# Just remember tag. Initialize header when we see the end tag.
|
||||
$self->{'tag'} = $tag;
|
||||
} elsif ($tag eq 'link') {
|
||||
return unless exists $attr->{href};
|
||||
# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
|
||||
my $href = delete($attr->{href});
|
||||
$href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5
|
||||
my $h_val = "<$href>";
|
||||
for (sort keys %{$attr}) {
|
||||
next if $_ eq "/"; # XHTML junk
|
||||
$h_val .= qq(; $_="$attr->{$_}");
|
||||
}
|
||||
$self->{'header'}->push_header(Link => $h_val);
|
||||
} elsif ($tag eq 'head' || $tag eq 'html') {
|
||||
# ignore
|
||||
} else {
|
||||
# stop parsing
|
||||
$self->eof;
|
||||
}
|
||||
}
|
||||
|
||||
sub end
|
||||
{
|
||||
my($self, $tag) = @_;
|
||||
print "END[$tag]\n" if $DEBUG;
|
||||
$self->flush_text if $self->{'tag'};
|
||||
$self->eof if $tag eq 'head';
|
||||
}
|
||||
|
||||
sub text
|
||||
{
|
||||
my($self, $text) = @_;
|
||||
print "TEXT[$text]\n" if $DEBUG;
|
||||
unless ($self->{first_chunk}) {
|
||||
# drop Unicode BOM if found
|
||||
if ($self->utf8_mode) {
|
||||
$text =~ s/^\xEF\xBB\xBF//;
|
||||
}
|
||||
else {
|
||||
$text =~ s/^\x{FEFF}//;
|
||||
}
|
||||
$self->{first_chunk}++;
|
||||
}
|
||||
my $tag = $self->{tag};
|
||||
if (!$tag && $text =~ /\S/) {
|
||||
# Normal text means start of body
|
||||
$self->eof;
|
||||
return;
|
||||
}
|
||||
return if $tag ne 'title';
|
||||
$self->{'text'} .= $text;
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
*utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLE
|
||||
|
||||
$h = HTTP::Headers->new;
|
||||
$p = HTML::HeadParser->new($h);
|
||||
$p->parse(<<EOT);
|
||||
<title>Stupid example</title>
|
||||
<base href="http://www.linpro.no/lwp/">
|
||||
Normal text starts here.
|
||||
EOT
|
||||
undef $p;
|
||||
print $h->title; # should print "Stupid example"
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTTP::Headers>
|
||||
|
||||
The C<HTTP::Headers> class is distributed as part of the
|
||||
I<libwww-perl> package. If you don't have that distribution installed
|
||||
you need to provide the $header argument to the C<HTML::HeadParser>
|
||||
constructor with your own object that implements the documented
|
||||
protocol.
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1996-2001 Gisle Aas. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
185
Git/usr/lib/perl5/vendor_perl/HTML/LinkExtor.pm
Normal file
185
Git/usr/lib/perl5/vendor_perl/HTML/LinkExtor.pm
Normal file
@ -0,0 +1,185 @@
|
||||
package HTML::LinkExtor;
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA = qw(HTML::Parser);
|
||||
$VERSION = "3.69";
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::LinkExtor - Extract links from an HTML document
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::LinkExtor;
|
||||
$p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
|
||||
sub cb {
|
||||
my($tag, %links) = @_;
|
||||
print "$tag @{[%links]}\n";
|
||||
}
|
||||
$p->parse_file("index.html");
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
I<HTML::LinkExtor> is an HTML parser that extracts links from an
|
||||
HTML document. The I<HTML::LinkExtor> is a subclass of
|
||||
I<HTML::Parser>. This means that the document should be given to the
|
||||
parser by calling the $p->parse() or $p->parse_file() methods.
|
||||
|
||||
=cut
|
||||
|
||||
use strict;
|
||||
use HTML::Tagset ();
|
||||
|
||||
# legacy (some applications grabs this hash directly)
|
||||
use vars qw(%LINK_ELEMENT);
|
||||
*LINK_ELEMENT = \%HTML::Tagset::linkElements;
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::LinkExtor->new
|
||||
|
||||
=item $p = HTML::LinkExtor->new( $callback )
|
||||
|
||||
=item $p = HTML::LinkExtor->new( $callback, $base )
|
||||
|
||||
The constructor takes two optional arguments. The first is a reference
|
||||
to a callback routine. It will be called as links are found. If a
|
||||
callback is not provided, then links are just accumulated internally
|
||||
and can be retrieved by calling the $p->links() method.
|
||||
|
||||
The $base argument is an optional base URL used to absolutize all URLs found.
|
||||
You need to have the I<URI> module installed if you provide $base.
|
||||
|
||||
The callback is called with the lowercase tag name as first argument,
|
||||
and then all link attributes as separate key/value pairs. All
|
||||
non-link attributes are removed.
|
||||
|
||||
=cut
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, $cb, $base) = @_;
|
||||
my $self = $class->SUPER::new(
|
||||
start_h => ["_start_tag", "self,tagname,attr"],
|
||||
report_tags => [keys %HTML::Tagset::linkElements],
|
||||
);
|
||||
$self->{extractlink_cb} = $cb;
|
||||
if ($base) {
|
||||
require URI;
|
||||
$self->{extractlink_base} = URI->new($base);
|
||||
}
|
||||
$self;
|
||||
}
|
||||
|
||||
sub _start_tag
|
||||
{
|
||||
my($self, $tag, $attr) = @_;
|
||||
|
||||
my $base = $self->{extractlink_base};
|
||||
my $links = $HTML::Tagset::linkElements{$tag};
|
||||
$links = [$links] unless ref $links;
|
||||
|
||||
my @links;
|
||||
my $a;
|
||||
for $a (@$links) {
|
||||
next unless exists $attr->{$a};
|
||||
(my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
|
||||
push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
|
||||
}
|
||||
return unless @links;
|
||||
$self->_found_link($tag, @links);
|
||||
}
|
||||
|
||||
sub _found_link
|
||||
{
|
||||
my $self = shift;
|
||||
my $cb = $self->{extractlink_cb};
|
||||
if ($cb) {
|
||||
&$cb(@_);
|
||||
} else {
|
||||
push(@{$self->{'links'}}, [@_]);
|
||||
}
|
||||
}
|
||||
|
||||
=item $p->links
|
||||
|
||||
Returns a list of all links found in the document. The returned
|
||||
values will be anonymous arrays with the following elements:
|
||||
|
||||
[$tag, $attr => $url1, $attr2 => $url2,...]
|
||||
|
||||
The $p->links method will also truncate the internal link list. This
|
||||
means that if the method is called twice without any parsing
|
||||
between them the second call will return an empty list.
|
||||
|
||||
Also note that $p->links will always be empty if a callback routine
|
||||
was provided when the I<HTML::LinkExtor> was created.
|
||||
|
||||
=cut
|
||||
|
||||
sub links
|
||||
{
|
||||
my $self = shift;
|
||||
exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
|
||||
}
|
||||
|
||||
# We override the parse_file() method so that we can clear the links
|
||||
# before we start a new file.
|
||||
sub parse_file
|
||||
{
|
||||
my $self = shift;
|
||||
delete $self->{'links'};
|
||||
$self->SUPER::parse_file(@_);
|
||||
}
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLE
|
||||
|
||||
This is an example showing how you can extract links from a document
|
||||
received using LWP:
|
||||
|
||||
use LWP::UserAgent;
|
||||
use HTML::LinkExtor;
|
||||
use URI::URL;
|
||||
|
||||
$url = "http://www.perl.org/"; # for instance
|
||||
$ua = LWP::UserAgent->new;
|
||||
|
||||
# Set up a callback that collect image links
|
||||
my @imgs = ();
|
||||
sub callback {
|
||||
my($tag, %attr) = @_;
|
||||
return if $tag ne 'img'; # we only look closer at <img ...>
|
||||
push(@imgs, values %attr);
|
||||
}
|
||||
|
||||
# Make the parser. Unfortunately, we don't know the base yet
|
||||
# (it might be different from $url)
|
||||
$p = HTML::LinkExtor->new(\&callback);
|
||||
|
||||
# Request document and parse it as it arrives
|
||||
$res = $ua->request(HTTP::Request->new(GET => $url),
|
||||
sub {$p->parse($_[0])});
|
||||
|
||||
# Expand all image URLs to absolute ones
|
||||
my $base = $res->base;
|
||||
@imgs = map { $_ = url($_, $base)->abs; } @imgs;
|
||||
|
||||
# Print them out
|
||||
print join("\n", @imgs), "\n";
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1996-2001 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
1234
Git/usr/lib/perl5/vendor_perl/HTML/Parser.pm
Normal file
1234
Git/usr/lib/perl5/vendor_perl/HTML/Parser.pm
Normal file
File diff suppressed because it is too large
Load Diff
209
Git/usr/lib/perl5/vendor_perl/HTML/PullParser.pm
Normal file
209
Git/usr/lib/perl5/vendor_perl/HTML/PullParser.pm
Normal file
@ -0,0 +1,209 @@
|
||||
package HTML::PullParser;
|
||||
|
||||
require HTML::Parser;
|
||||
@ISA=qw(HTML::Parser);
|
||||
$VERSION = "3.57";
|
||||
|
||||
use strict;
|
||||
use Carp ();
|
||||
|
||||
sub new
|
||||
{
|
||||
my($class, %cnf) = @_;
|
||||
|
||||
# Construct argspecs for the various events
|
||||
my %argspec;
|
||||
for (qw(start end text declaration comment process default)) {
|
||||
my $tmp = delete $cnf{$_};
|
||||
next unless defined $tmp;
|
||||
$argspec{$_} = $tmp;
|
||||
}
|
||||
Carp::croak("Info not collected for any events")
|
||||
unless %argspec;
|
||||
|
||||
my $file = delete $cnf{file};
|
||||
my $doc = delete $cnf{doc};
|
||||
Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
|
||||
if defined($file) && defined($doc);
|
||||
Carp::croak("No 'doc' or 'file' given to parse from")
|
||||
unless defined($file) || defined($doc);
|
||||
|
||||
# Create object
|
||||
$cnf{api_version} = 3;
|
||||
my $self = $class->SUPER::new(%cnf);
|
||||
|
||||
my $accum = $self->{pullparser_accum} = [];
|
||||
while (my($event, $argspec) = each %argspec) {
|
||||
$self->SUPER::handler($event => $accum, $argspec);
|
||||
}
|
||||
|
||||
if (defined $doc) {
|
||||
$self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
|
||||
$self->{pullparser_str_pos} = 0;
|
||||
}
|
||||
else {
|
||||
if (!ref($file) && ref(\$file) ne "GLOB") {
|
||||
require IO::File;
|
||||
$file = IO::File->new($file, "r") || return;
|
||||
}
|
||||
|
||||
$self->{pullparser_file} = $file;
|
||||
}
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub handler
|
||||
{
|
||||
Carp::croak("Can't set handlers for HTML::PullParser");
|
||||
}
|
||||
|
||||
|
||||
sub get_token
|
||||
{
|
||||
my $self = shift;
|
||||
while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
|
||||
if (my $f = $self->{pullparser_file}) {
|
||||
# must try to parse more from the file
|
||||
my $buf;
|
||||
if (read($f, $buf, 512)) {
|
||||
$self->parse($buf);
|
||||
} else {
|
||||
$self->eof;
|
||||
$self->{pullparser_eof}++;
|
||||
delete $self->{pullparser_file};
|
||||
}
|
||||
}
|
||||
elsif (my $sref = $self->{pullparser_str_ref}) {
|
||||
# must try to parse more from the scalar
|
||||
my $pos = $self->{pullparser_str_pos};
|
||||
my $chunk = substr($$sref, $pos, 512);
|
||||
$self->parse($chunk);
|
||||
$pos += length($chunk);
|
||||
if ($pos < length($$sref)) {
|
||||
$self->{pullparser_str_pos} = $pos;
|
||||
}
|
||||
else {
|
||||
$self->eof;
|
||||
$self->{pullparser_eof}++;
|
||||
delete $self->{pullparser_str_ref};
|
||||
delete $self->{pullparser_str_pos};
|
||||
}
|
||||
}
|
||||
else {
|
||||
die;
|
||||
}
|
||||
}
|
||||
shift @{$self->{pullparser_accum}};
|
||||
}
|
||||
|
||||
|
||||
sub unget_token
|
||||
{
|
||||
my $self = shift;
|
||||
unshift @{$self->{pullparser_accum}}, @_;
|
||||
$self;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::PullParser - Alternative HTML::Parser interface
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use HTML::PullParser;
|
||||
|
||||
$p = HTML::PullParser->new(file => "index.html",
|
||||
start => 'event, tagname, @attr',
|
||||
end => 'event, tagname',
|
||||
ignore_elements => [qw(script style)],
|
||||
) || die "Can't open: $!";
|
||||
while (my $token = $p->get_token) {
|
||||
#...do something with $token
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The HTML::PullParser is an alternative interface to the HTML::Parser class.
|
||||
It basically turns the HTML::Parser inside out. You associate a file
|
||||
(or any IO::Handle object or string) with the parser at construction time and
|
||||
then repeatedly call $parser->get_token to obtain the tags and text
|
||||
found in the parsed document.
|
||||
|
||||
The following methods are provided:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::PullParser->new( file => $file, %options )
|
||||
|
||||
=item $p = HTML::PullParser->new( doc => \$doc, %options )
|
||||
|
||||
A C<HTML::PullParser> can be made to parse from either a file or a
|
||||
literal document based on whether the C<file> or C<doc> option is
|
||||
passed to the parser's constructor.
|
||||
|
||||
The C<file> passed in can either be a file name or a file handle
|
||||
object. If a file name is passed, and it can't be opened for reading,
|
||||
then the constructor will return an undefined value and $! will tell
|
||||
you why it failed. Otherwise the argument is taken to be some object
|
||||
that the C<HTML::PullParser> can read() from when it needs more data.
|
||||
The stream will be read() until EOF, but not closed.
|
||||
|
||||
A C<doc> can be passed plain or as a reference
|
||||
to a scalar. If a reference is passed then the value of this scalar
|
||||
should not be changed before all tokens have been extracted.
|
||||
|
||||
Next the information to be returned for the different token types must
|
||||
be set up. This is done by simply associating an argspec (as defined
|
||||
in L<HTML::Parser>) with the events you have an interest in. For
|
||||
instance, if you want C<start> tokens to be reported as the string
|
||||
C<'S'> followed by the tagname and the attributes you might pass an
|
||||
C<start>-option like this:
|
||||
|
||||
$p = HTML::PullParser->new(
|
||||
doc => $document_to_parse,
|
||||
start => '"S", tagname, @attr',
|
||||
end => '"E", tagname',
|
||||
);
|
||||
|
||||
At last other C<HTML::Parser> options, like C<ignore_tags>, and
|
||||
C<unbroken_text>, can be passed in. Note that you should not use the
|
||||
I<event>_h options to set up parser handlers. That would confuse the
|
||||
inner logic of C<HTML::PullParser>.
|
||||
|
||||
=item $token = $p->get_token
|
||||
|
||||
This method will return the next I<token> found in the HTML document,
|
||||
or C<undef> at the end of the document. The token is returned as an
|
||||
array reference. The content of this array match the argspec set up
|
||||
during C<HTML::PullParser> construction.
|
||||
|
||||
=item $p->unget_token( @tokens )
|
||||
|
||||
If you find out you have read too many tokens you can push them back,
|
||||
so that they are returned again the next time $p->get_token is called.
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
The 'eg/hform' script shows how we might parse the form section of
|
||||
HTML::Documents using HTML::PullParser.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::Parser>, L<HTML::TokeParser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1998-2001 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
371
Git/usr/lib/perl5/vendor_perl/HTML/TokeParser.pm
Normal file
371
Git/usr/lib/perl5/vendor_perl/HTML/TokeParser.pm
Normal file
@ -0,0 +1,371 @@
|
||||
package HTML::TokeParser;
|
||||
|
||||
require HTML::PullParser;
|
||||
@ISA=qw(HTML::PullParser);
|
||||
$VERSION = "3.69";
|
||||
|
||||
use strict;
|
||||
use Carp ();
|
||||
use HTML::Entities qw(decode_entities);
|
||||
use HTML::Tagset ();
|
||||
|
||||
my %ARGS =
|
||||
(
|
||||
start => "'S',tagname,attr,attrseq,text",
|
||||
end => "'E',tagname,text",
|
||||
text => "'T',text,is_cdata",
|
||||
process => "'PI',token0,text",
|
||||
comment => "'C',text",
|
||||
declaration => "'D',text",
|
||||
|
||||
# options that default on
|
||||
unbroken_text => 1,
|
||||
);
|
||||
|
||||
|
||||
sub new
|
||||
{
|
||||
my $class = shift;
|
||||
my %cnf;
|
||||
|
||||
if (@_ == 1) {
|
||||
my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
|
||||
%cnf = ($type => $_[0]);
|
||||
}
|
||||
else {
|
||||
unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1);
|
||||
%cnf = @_;
|
||||
}
|
||||
|
||||
my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
|
||||
|
||||
my $self = $class->SUPER::new(%ARGS, %cnf) || return undef;
|
||||
|
||||
$self->{textify} = $textify;
|
||||
$self;
|
||||
}
|
||||
|
||||
|
||||
sub get_tag
|
||||
{
|
||||
my $self = shift;
|
||||
my $token;
|
||||
while (1) {
|
||||
$token = $self->get_token || return undef;
|
||||
my $type = shift @$token;
|
||||
next unless $type eq "S" || $type eq "E";
|
||||
substr($token->[0], 0, 0) = "/" if $type eq "E";
|
||||
return $token unless @_;
|
||||
for (@_) {
|
||||
return $token if $token->[0] eq $_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
sub _textify {
|
||||
my($self, $token) = @_;
|
||||
my $tag = $token->[1];
|
||||
return undef unless exists $self->{textify}{$tag};
|
||||
|
||||
my $alt = $self->{textify}{$tag};
|
||||
my $text;
|
||||
if (ref($alt)) {
|
||||
$text = &$alt(@$token);
|
||||
} else {
|
||||
$text = $token->[2]{$alt || "alt"};
|
||||
$text = "[\U$tag]" unless defined $text;
|
||||
}
|
||||
return $text;
|
||||
}
|
||||
|
||||
|
||||
sub get_text
|
||||
{
|
||||
my $self = shift;
|
||||
my @text;
|
||||
while (my $token = $self->get_token) {
|
||||
my $type = $token->[0];
|
||||
if ($type eq "T") {
|
||||
my $text = $token->[1];
|
||||
decode_entities($text) unless $token->[2];
|
||||
push(@text, $text);
|
||||
} elsif ($type =~ /^[SE]$/) {
|
||||
my $tag = $token->[1];
|
||||
if ($type eq "S") {
|
||||
if (defined(my $text = _textify($self, $token))) {
|
||||
push(@text, $text);
|
||||
next;
|
||||
}
|
||||
} else {
|
||||
$tag = "/$tag";
|
||||
}
|
||||
if (!@_ || grep $_ eq $tag, @_) {
|
||||
$self->unget_token($token);
|
||||
last;
|
||||
}
|
||||
push(@text, " ")
|
||||
if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
|
||||
}
|
||||
}
|
||||
join("", @text);
|
||||
}
|
||||
|
||||
|
||||
sub get_trimmed_text
|
||||
{
|
||||
my $self = shift;
|
||||
my $text = $self->get_text(@_);
|
||||
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
||||
$text;
|
||||
}
|
||||
|
||||
sub get_phrase {
|
||||
my $self = shift;
|
||||
my @text;
|
||||
while (my $token = $self->get_token) {
|
||||
my $type = $token->[0];
|
||||
if ($type eq "T") {
|
||||
my $text = $token->[1];
|
||||
decode_entities($text) unless $token->[2];
|
||||
push(@text, $text);
|
||||
} elsif ($type =~ /^[SE]$/) {
|
||||
my $tag = $token->[1];
|
||||
if ($type eq "S") {
|
||||
if (defined(my $text = _textify($self, $token))) {
|
||||
push(@text, $text);
|
||||
next;
|
||||
}
|
||||
}
|
||||
if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
|
||||
$self->unget_token($token);
|
||||
last;
|
||||
}
|
||||
push(@text, " ") if $tag eq "br";
|
||||
}
|
||||
}
|
||||
my $text = join("", @text);
|
||||
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
||||
$text;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
HTML::TokeParser - Alternative HTML::Parser interface
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require HTML::TokeParser;
|
||||
$p = HTML::TokeParser->new("index.html") ||
|
||||
die "Can't open: $!";
|
||||
$p->empty_element_tags(1); # configure its behaviour
|
||||
|
||||
while (my $token = $p->get_token) {
|
||||
#...
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The C<HTML::TokeParser> is an alternative interface to the
|
||||
C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
|
||||
predeclared set of token types. If you wish the tokens to be reported
|
||||
differently you probably want to use the C<HTML::PullParser> directly.
|
||||
|
||||
The following methods are available:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $p = HTML::TokeParser->new( $filename, %opt );
|
||||
|
||||
=item $p = HTML::TokeParser->new( $filehandle, %opt );
|
||||
|
||||
=item $p = HTML::TokeParser->new( \$document, %opt );
|
||||
|
||||
The object constructor argument is either a file name, a file handle
|
||||
object, or the complete document to be parsed. Extra options can be
|
||||
provided as key/value pairs and are processed as documented by the base
|
||||
classes.
|
||||
|
||||
If the argument is a plain scalar, then it is taken as the name of a
|
||||
file to be opened and parsed. If the file can't be opened for
|
||||
reading, then the constructor will return C<undef> and $! will tell
|
||||
you why it failed.
|
||||
|
||||
If the argument is a reference to a plain scalar, then this scalar is
|
||||
taken to be the literal document to parse. The value of this
|
||||
scalar should not be changed before all tokens have been extracted.
|
||||
|
||||
Otherwise the argument is taken to be some object that the
|
||||
C<HTML::TokeParser> can read() from when it needs more data. Typically
|
||||
it will be a filehandle of some kind. The stream will be read() until
|
||||
EOF, but not closed.
|
||||
|
||||
A newly constructed C<HTML::TokeParser> differ from its base classes
|
||||
by having the C<unbroken_text> attribute enabled by default. See
|
||||
L<HTML::Parser> for a description of this and other attributes that
|
||||
influence how the document is parsed. It is often a good idea to enable
|
||||
C<empty_element_tags> behaviour.
|
||||
|
||||
Note that the parsing result will likely not be valid if raw undecoded
|
||||
UTF-8 is used as a source. When parsing UTF-8 encoded files turn
|
||||
on UTF-8 decoding:
|
||||
|
||||
open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
|
||||
my $p = HTML::TokeParser->new( $fh );
|
||||
# ...
|
||||
|
||||
If a $filename is passed to the constructor the file will be opened in
|
||||
raw mode and the parsing result will only be valid if its content is
|
||||
Latin-1 or pure ASCII.
|
||||
|
||||
If parsing from an UTF-8 encoded string buffer decode it first:
|
||||
|
||||
utf8::decode($document);
|
||||
my $p = HTML::TokeParser->new( \$document );
|
||||
# ...
|
||||
|
||||
=item $p->get_token
|
||||
|
||||
This method will return the next I<token> found in the HTML document,
|
||||
or C<undef> at the end of the document. The token is returned as an
|
||||
array reference. The first element of the array will be a string
|
||||
denoting the type of this token: "S" for start tag, "E" for end tag,
|
||||
"T" for text, "C" for comment, "D" for declaration, and "PI" for
|
||||
process instructions. The rest of the token array depend on the type
|
||||
like this:
|
||||
|
||||
["S", $tag, $attr, $attrseq, $text]
|
||||
["E", $tag, $text]
|
||||
["T", $text, $is_data]
|
||||
["C", $text]
|
||||
["D", $text]
|
||||
["PI", $token0, $text]
|
||||
|
||||
where $attr is a hash reference, $attrseq is an array reference and
|
||||
the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
|
||||
details.
|
||||
|
||||
=item $p->unget_token( @tokens )
|
||||
|
||||
If you find you have read too many tokens you can push them back,
|
||||
so that they are returned the next time $p->get_token is called.
|
||||
|
||||
=item $p->get_tag
|
||||
|
||||
=item $p->get_tag( @tags )
|
||||
|
||||
This method returns the next start or end tag (skipping any other
|
||||
tokens), or C<undef> if there are no more tags in the document. If
|
||||
one or more arguments are given, then we skip tokens until one of the
|
||||
specified tag types is found. For example:
|
||||
|
||||
$p->get_tag("font", "/font");
|
||||
|
||||
will find the next start or end tag for a font-element.
|
||||
|
||||
The tag information is returned as an array reference in the same form
|
||||
as for $p->get_token above, but the type code (first element) is
|
||||
missing. A start tag will be returned like this:
|
||||
|
||||
[$tag, $attr, $attrseq, $text]
|
||||
|
||||
The tagname of end tags are prefixed with "/", i.e. end tag is
|
||||
returned like this:
|
||||
|
||||
["/$tag", $text]
|
||||
|
||||
=item $p->get_text
|
||||
|
||||
=item $p->get_text( @endtags )
|
||||
|
||||
This method returns all text found at the current position. It will
|
||||
return a zero length string if the next token is not text. Any
|
||||
entities will be converted to their corresponding character.
|
||||
|
||||
If one or more arguments are given, then we return all text occurring
|
||||
before the first of the specified tags found. For example:
|
||||
|
||||
$p->get_text("p", "br");
|
||||
|
||||
will return the text up to either a paragraph of linebreak element.
|
||||
|
||||
The text might span tags that should be I<textified>. This is
|
||||
controlled by the $p->{textify} attribute, which is a hash that
|
||||
defines how certain tags can be treated as text. If the name of a
|
||||
start tag matches a key in this hash then this tag is converted to
|
||||
text. The hash value is used to specify which tag attribute to obtain
|
||||
the text from. If this tag attribute is missing, then the upper case
|
||||
name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
|
||||
hash value can also be a subroutine reference. In this case the
|
||||
routine is called with the start tag token content as its argument and
|
||||
the return value is treated as the text.
|
||||
|
||||
The default $p->{textify} value is:
|
||||
|
||||
{img => "alt", applet => "alt"}
|
||||
|
||||
This means that <IMG> and <APPLET> tags are treated as text, and that
|
||||
the text to substitute can be found in the ALT attribute.
|
||||
|
||||
=item $p->get_trimmed_text
|
||||
|
||||
=item $p->get_trimmed_text( @endtags )
|
||||
|
||||
Same as $p->get_text above, but will collapse any sequences of white
|
||||
space to a single space character. Leading and trailing white space is
|
||||
removed.
|
||||
|
||||
=item $p->get_phrase
|
||||
|
||||
This will return all text found at the current position ignoring any
|
||||
phrasal-level tags. Text is extracted until the first non
|
||||
phrasal-level tag. Textification of tags is the same as for
|
||||
get_text(). This method will collapse white space in the same way as
|
||||
get_trimmed_text() does.
|
||||
|
||||
The definition of <i>phrasal-level tags</i> is obtained from the
|
||||
HTML::Tagset module.
|
||||
|
||||
=back
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
This example extracts all links from a document. It will print one
|
||||
line for each link, containing the URL and the textual description
|
||||
between the <A>...</A> tags:
|
||||
|
||||
use HTML::TokeParser;
|
||||
$p = HTML::TokeParser->new(shift||"index.html");
|
||||
|
||||
while (my $token = $p->get_tag("a")) {
|
||||
my $url = $token->[1]{href} || "-";
|
||||
my $text = $p->get_trimmed_text("/a");
|
||||
print "$url\t$text\n";
|
||||
}
|
||||
|
||||
This example extract the <TITLE> from the document:
|
||||
|
||||
use HTML::TokeParser;
|
||||
$p = HTML::TokeParser->new(shift||"index.html");
|
||||
if ($p->get_tag("title")) {
|
||||
my $title = $p->get_trimmed_text;
|
||||
print "Title: $title\n";
|
||||
}
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<HTML::PullParser>, L<HTML::Parser>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1998-2005 Gisle Aas.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
Reference in New Issue
Block a user