484 lines
15 KiB
Perl
484 lines
15 KiB
Perl
|
package HTML::Entities;
|
|||
|
|
|||
|
=encoding utf8
|
|||
|
|
|||
|
=head1 NAME
|
|||
|
|
|||
|
HTML::Entities - Encode or decode strings with HTML entities
|
|||
|
|
|||
|
=head1 SYNOPSIS
|
|||
|
|
|||
|
use HTML::Entities;
|
|||
|
|
|||
|
$a = "Våre norske tegn bør æres";
|
|||
|
decode_entities($a);
|
|||
|
encode_entities($a, "\200-\377");
|
|||
|
|
|||
|
For example, this:
|
|||
|
|
|||
|
$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
|
|||
|
print encode_entities($input), "\n"
|
|||
|
|
|||
|
Prints this out:
|
|||
|
|
|||
|
vis-à-vis Beyoncé's naïve
|
|||
|
papier-mâché résumé
|
|||
|
|
|||
|
=head1 DESCRIPTION
|
|||
|
|
|||
|
This module deals with encoding and decoding of strings with HTML
|
|||
|
character entities. The module provides the following functions:
|
|||
|
|
|||
|
=over 4
|
|||
|
|
|||
|
=item decode_entities( $string, ... )
|
|||
|
|
|||
|
This routine replaces HTML entities found in the $string with the
|
|||
|
corresponding Unicode character. Unrecognized entities are left alone.
|
|||
|
|
|||
|
If multiple strings are provided as argument they are each decoded
|
|||
|
separately and the same number of strings are returned.
|
|||
|
|
|||
|
If called in void context the arguments are decoded in-place.
|
|||
|
|
|||
|
This routine is exported by default.
|
|||
|
|
|||
|
=item _decode_entities( $string, \%entity2char )
|
|||
|
|
|||
|
=item _decode_entities( $string, \%entity2char, $expand_prefix )
|
|||
|
|
|||
|
This will in-place replace HTML entities in $string. The %entity2char
|
|||
|
hash must be provided. Named entities not found in the %entity2char
|
|||
|
hash are left alone. Numeric entities are expanded unless their value
|
|||
|
overflow.
|
|||
|
|
|||
|
The keys in %entity2char are the entity names to be expanded and their
|
|||
|
values are what they should expand into. The values do not have to be
|
|||
|
single character strings. If a key has ";" as suffix,
|
|||
|
then occurrences in $string are only expanded if properly terminated
|
|||
|
with ";". Entities without ";" will be expanded regardless of how
|
|||
|
they are terminated for compatibility with how common browsers treat
|
|||
|
entities in the Latin-1 range.
|
|||
|
|
|||
|
If $expand_prefix is TRUE then entities without trailing ";" in
|
|||
|
%entity2char will even be expanded as a prefix of a longer
|
|||
|
unrecognized name. The longest matching name in %entity2char will be
|
|||
|
used. This is mainly present for compatibility with an MSIE
|
|||
|
misfeature.
|
|||
|
|
|||
|
$string = "foo bar";
|
|||
|
_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
|
|||
|
print $string; # will print "foo bar"
|
|||
|
|
|||
|
This routine is exported by default.
|
|||
|
|
|||
|
=item encode_entities( $string )
|
|||
|
|
|||
|
=item encode_entities( $string, $unsafe_chars )
|
|||
|
|
|||
|
This routine replaces unsafe characters in $string with their entity
|
|||
|
representation. A second argument can be given to specify which characters to
|
|||
|
consider unsafe. The unsafe characters is specified using the regular
|
|||
|
expression character class syntax (what you find within brackets in regular
|
|||
|
expressions).
|
|||
|
|
|||
|
The default set of characters to encode are control chars, high-bit chars, and
|
|||
|
the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
|
|||
|
for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
|
|||
|
>> characters:
|
|||
|
|
|||
|
$encoded = encode_entities($input, '<>&"');
|
|||
|
|
|||
|
and this would only encode non-plain ascii:
|
|||
|
|
|||
|
$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
|
|||
|
|
|||
|
This routine is exported by default.
|
|||
|
|
|||
|
=item encode_entities_numeric( $string )
|
|||
|
|
|||
|
=item encode_entities_numeric( $string, $unsafe_chars )
|
|||
|
|
|||
|
This routine works just like encode_entities, except that the replacement
|
|||
|
entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
|
|||
|
example, C<encode_entities("r\xF4le")> returns "rôle", but
|
|||
|
C<encode_entities_numeric("r\xF4le")> returns "rôle".
|
|||
|
|
|||
|
This routine is I<not> exported by default. But you can always
|
|||
|
export it with C<use HTML::Entities qw(encode_entities_numeric);>
|
|||
|
or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
|
|||
|
|
|||
|
=back
|
|||
|
|
|||
|
All these routines modify the string passed as the first argument, if
|
|||
|
called in a void context. In scalar and array contexts, the encoded or
|
|||
|
decoded string is returned (without changing the input string).
|
|||
|
|
|||
|
If you prefer not to import these routines into your namespace, you can
|
|||
|
call them as:
|
|||
|
|
|||
|
use HTML::Entities ();
|
|||
|
$decoded = HTML::Entities::decode($a);
|
|||
|
$encoded = HTML::Entities::encode($a);
|
|||
|
$encoded = HTML::Entities::encode_numeric($a);
|
|||
|
|
|||
|
The module can also export the %char2entity and the %entity2char
|
|||
|
hashes, which contain the mapping from all characters to the
|
|||
|
corresponding entities (and vice versa, respectively).
|
|||
|
|
|||
|
=head1 COPYRIGHT
|
|||
|
|
|||
|
Copyright 1995-2006 Gisle Aas. All rights reserved.
|
|||
|
|
|||
|
This library is free software; you can redistribute it and/or
|
|||
|
modify it under the same terms as Perl itself.
|
|||
|
|
|||
|
=cut
|
|||
|
|
|||
|
use strict;
|
|||
|
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
|
|||
|
use vars qw(%entity2char %char2entity);
|
|||
|
|
|||
|
require 5.004;
|
|||
|
require Exporter;
|
|||
|
@ISA = qw(Exporter);
|
|||
|
|
|||
|
@EXPORT = qw(encode_entities decode_entities _decode_entities);
|
|||
|
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
|
|||
|
|
|||
|
$VERSION = "3.69";
|
|||
|
sub Version { $VERSION; }
|
|||
|
|
|||
|
require HTML::Parser; # for fast XS implemented decode_entities
|
|||
|
|
|||
|
|
|||
|
%entity2char = (
|
|||
|
# Some normal chars that have special meaning in SGML context
|
|||
|
amp => '&', # ampersand
|
|||
|
'gt' => '>', # greater than
|
|||
|
'lt' => '<', # less than
|
|||
|
quot => '"', # double quote
|
|||
|
apos => "'", # single quote
|
|||
|
|
|||
|
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
|
|||
|
AElig => chr(198), # capital AE diphthong (ligature)
|
|||
|
Aacute => chr(193), # capital A, acute accent
|
|||
|
Acirc => chr(194), # capital A, circumflex accent
|
|||
|
Agrave => chr(192), # capital A, grave accent
|
|||
|
Aring => chr(197), # capital A, ring
|
|||
|
Atilde => chr(195), # capital A, tilde
|
|||
|
Auml => chr(196), # capital A, dieresis or umlaut mark
|
|||
|
Ccedil => chr(199), # capital C, cedilla
|
|||
|
ETH => chr(208), # capital Eth, Icelandic
|
|||
|
Eacute => chr(201), # capital E, acute accent
|
|||
|
Ecirc => chr(202), # capital E, circumflex accent
|
|||
|
Egrave => chr(200), # capital E, grave accent
|
|||
|
Euml => chr(203), # capital E, dieresis or umlaut mark
|
|||
|
Iacute => chr(205), # capital I, acute accent
|
|||
|
Icirc => chr(206), # capital I, circumflex accent
|
|||
|
Igrave => chr(204), # capital I, grave accent
|
|||
|
Iuml => chr(207), # capital I, dieresis or umlaut mark
|
|||
|
Ntilde => chr(209), # capital N, tilde
|
|||
|
Oacute => chr(211), # capital O, acute accent
|
|||
|
Ocirc => chr(212), # capital O, circumflex accent
|
|||
|
Ograve => chr(210), # capital O, grave accent
|
|||
|
Oslash => chr(216), # capital O, slash
|
|||
|
Otilde => chr(213), # capital O, tilde
|
|||
|
Ouml => chr(214), # capital O, dieresis or umlaut mark
|
|||
|
THORN => chr(222), # capital THORN, Icelandic
|
|||
|
Uacute => chr(218), # capital U, acute accent
|
|||
|
Ucirc => chr(219), # capital U, circumflex accent
|
|||
|
Ugrave => chr(217), # capital U, grave accent
|
|||
|
Uuml => chr(220), # capital U, dieresis or umlaut mark
|
|||
|
Yacute => chr(221), # capital Y, acute accent
|
|||
|
aacute => chr(225), # small a, acute accent
|
|||
|
acirc => chr(226), # small a, circumflex accent
|
|||
|
aelig => chr(230), # small ae diphthong (ligature)
|
|||
|
agrave => chr(224), # small a, grave accent
|
|||
|
aring => chr(229), # small a, ring
|
|||
|
atilde => chr(227), # small a, tilde
|
|||
|
auml => chr(228), # small a, dieresis or umlaut mark
|
|||
|
ccedil => chr(231), # small c, cedilla
|
|||
|
eacute => chr(233), # small e, acute accent
|
|||
|
ecirc => chr(234), # small e, circumflex accent
|
|||
|
egrave => chr(232), # small e, grave accent
|
|||
|
eth => chr(240), # small eth, Icelandic
|
|||
|
euml => chr(235), # small e, dieresis or umlaut mark
|
|||
|
iacute => chr(237), # small i, acute accent
|
|||
|
icirc => chr(238), # small i, circumflex accent
|
|||
|
igrave => chr(236), # small i, grave accent
|
|||
|
iuml => chr(239), # small i, dieresis or umlaut mark
|
|||
|
ntilde => chr(241), # small n, tilde
|
|||
|
oacute => chr(243), # small o, acute accent
|
|||
|
ocirc => chr(244), # small o, circumflex accent
|
|||
|
ograve => chr(242), # small o, grave accent
|
|||
|
oslash => chr(248), # small o, slash
|
|||
|
otilde => chr(245), # small o, tilde
|
|||
|
ouml => chr(246), # small o, dieresis or umlaut mark
|
|||
|
szlig => chr(223), # small sharp s, German (sz ligature)
|
|||
|
thorn => chr(254), # small thorn, Icelandic
|
|||
|
uacute => chr(250), # small u, acute accent
|
|||
|
ucirc => chr(251), # small u, circumflex accent
|
|||
|
ugrave => chr(249), # small u, grave accent
|
|||
|
uuml => chr(252), # small u, dieresis or umlaut mark
|
|||
|
yacute => chr(253), # small y, acute accent
|
|||
|
yuml => chr(255), # small y, dieresis or umlaut mark
|
|||
|
|
|||
|
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
|
|||
|
copy => chr(169), # copyright sign
|
|||
|
reg => chr(174), # registered sign
|
|||
|
nbsp => chr(160), # non breaking space
|
|||
|
|
|||
|
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
|
|||
|
iexcl => chr(161),
|
|||
|
cent => chr(162),
|
|||
|
pound => chr(163),
|
|||
|
curren => chr(164),
|
|||
|
yen => chr(165),
|
|||
|
brvbar => chr(166),
|
|||
|
sect => chr(167),
|
|||
|
uml => chr(168),
|
|||
|
ordf => chr(170),
|
|||
|
laquo => chr(171),
|
|||
|
'not' => chr(172), # not is a keyword in perl
|
|||
|
shy => chr(173),
|
|||
|
macr => chr(175),
|
|||
|
deg => chr(176),
|
|||
|
plusmn => chr(177),
|
|||
|
sup1 => chr(185),
|
|||
|
sup2 => chr(178),
|
|||
|
sup3 => chr(179),
|
|||
|
acute => chr(180),
|
|||
|
micro => chr(181),
|
|||
|
para => chr(182),
|
|||
|
middot => chr(183),
|
|||
|
cedil => chr(184),
|
|||
|
ordm => chr(186),
|
|||
|
raquo => chr(187),
|
|||
|
frac14 => chr(188),
|
|||
|
frac12 => chr(189),
|
|||
|
frac34 => chr(190),
|
|||
|
iquest => chr(191),
|
|||
|
'times' => chr(215), # times is a keyword in perl
|
|||
|
divide => chr(247),
|
|||
|
|
|||
|
( $] > 5.007 ? (
|
|||
|
'OElig;' => chr(338),
|
|||
|
'oelig;' => chr(339),
|
|||
|
'Scaron;' => chr(352),
|
|||
|
'scaron;' => chr(353),
|
|||
|
'Yuml;' => chr(376),
|
|||
|
'fnof;' => chr(402),
|
|||
|
'circ;' => chr(710),
|
|||
|
'tilde;' => chr(732),
|
|||
|
'Alpha;' => chr(913),
|
|||
|
'Beta;' => chr(914),
|
|||
|
'Gamma;' => chr(915),
|
|||
|
'Delta;' => chr(916),
|
|||
|
'Epsilon;' => chr(917),
|
|||
|
'Zeta;' => chr(918),
|
|||
|
'Eta;' => chr(919),
|
|||
|
'Theta;' => chr(920),
|
|||
|
'Iota;' => chr(921),
|
|||
|
'Kappa;' => chr(922),
|
|||
|
'Lambda;' => chr(923),
|
|||
|
'Mu;' => chr(924),
|
|||
|
'Nu;' => chr(925),
|
|||
|
'Xi;' => chr(926),
|
|||
|
'Omicron;' => chr(927),
|
|||
|
'Pi;' => chr(928),
|
|||
|
'Rho;' => chr(929),
|
|||
|
'Sigma;' => chr(931),
|
|||
|
'Tau;' => chr(932),
|
|||
|
'Upsilon;' => chr(933),
|
|||
|
'Phi;' => chr(934),
|
|||
|
'Chi;' => chr(935),
|
|||
|
'Psi;' => chr(936),
|
|||
|
'Omega;' => chr(937),
|
|||
|
'alpha;' => chr(945),
|
|||
|
'beta;' => chr(946),
|
|||
|
'gamma;' => chr(947),
|
|||
|
'delta;' => chr(948),
|
|||
|
'epsilon;' => chr(949),
|
|||
|
'zeta;' => chr(950),
|
|||
|
'eta;' => chr(951),
|
|||
|
'theta;' => chr(952),
|
|||
|
'iota;' => chr(953),
|
|||
|
'kappa;' => chr(954),
|
|||
|
'lambda;' => chr(955),
|
|||
|
'mu;' => chr(956),
|
|||
|
'nu;' => chr(957),
|
|||
|
'xi;' => chr(958),
|
|||
|
'omicron;' => chr(959),
|
|||
|
'pi;' => chr(960),
|
|||
|
'rho;' => chr(961),
|
|||
|
'sigmaf;' => chr(962),
|
|||
|
'sigma;' => chr(963),
|
|||
|
'tau;' => chr(964),
|
|||
|
'upsilon;' => chr(965),
|
|||
|
'phi;' => chr(966),
|
|||
|
'chi;' => chr(967),
|
|||
|
'psi;' => chr(968),
|
|||
|
'omega;' => chr(969),
|
|||
|
'thetasym;' => chr(977),
|
|||
|
'upsih;' => chr(978),
|
|||
|
'piv;' => chr(982),
|
|||
|
'ensp;' => chr(8194),
|
|||
|
'emsp;' => chr(8195),
|
|||
|
'thinsp;' => chr(8201),
|
|||
|
'zwnj;' => chr(8204),
|
|||
|
'zwj;' => chr(8205),
|
|||
|
'lrm;' => chr(8206),
|
|||
|
'rlm;' => chr(8207),
|
|||
|
'ndash;' => chr(8211),
|
|||
|
'mdash;' => chr(8212),
|
|||
|
'lsquo;' => chr(8216),
|
|||
|
'rsquo;' => chr(8217),
|
|||
|
'sbquo;' => chr(8218),
|
|||
|
'ldquo;' => chr(8220),
|
|||
|
'rdquo;' => chr(8221),
|
|||
|
'bdquo;' => chr(8222),
|
|||
|
'dagger;' => chr(8224),
|
|||
|
'Dagger;' => chr(8225),
|
|||
|
'bull;' => chr(8226),
|
|||
|
'hellip;' => chr(8230),
|
|||
|
'permil;' => chr(8240),
|
|||
|
'prime;' => chr(8242),
|
|||
|
'Prime;' => chr(8243),
|
|||
|
'lsaquo;' => chr(8249),
|
|||
|
'rsaquo;' => chr(8250),
|
|||
|
'oline;' => chr(8254),
|
|||
|
'frasl;' => chr(8260),
|
|||
|
'euro;' => chr(8364),
|
|||
|
'image;' => chr(8465),
|
|||
|
'weierp;' => chr(8472),
|
|||
|
'real;' => chr(8476),
|
|||
|
'trade;' => chr(8482),
|
|||
|
'alefsym;' => chr(8501),
|
|||
|
'larr;' => chr(8592),
|
|||
|
'uarr;' => chr(8593),
|
|||
|
'rarr;' => chr(8594),
|
|||
|
'darr;' => chr(8595),
|
|||
|
'harr;' => chr(8596),
|
|||
|
'crarr;' => chr(8629),
|
|||
|
'lArr;' => chr(8656),
|
|||
|
'uArr;' => chr(8657),
|
|||
|
'rArr;' => chr(8658),
|
|||
|
'dArr;' => chr(8659),
|
|||
|
'hArr;' => chr(8660),
|
|||
|
'forall;' => chr(8704),
|
|||
|
'part;' => chr(8706),
|
|||
|
'exist;' => chr(8707),
|
|||
|
'empty;' => chr(8709),
|
|||
|
'nabla;' => chr(8711),
|
|||
|
'isin;' => chr(8712),
|
|||
|
'notin;' => chr(8713),
|
|||
|
'ni;' => chr(8715),
|
|||
|
'prod;' => chr(8719),
|
|||
|
'sum;' => chr(8721),
|
|||
|
'minus;' => chr(8722),
|
|||
|
'lowast;' => chr(8727),
|
|||
|
'radic;' => chr(8730),
|
|||
|
'prop;' => chr(8733),
|
|||
|
'infin;' => chr(8734),
|
|||
|
'ang;' => chr(8736),
|
|||
|
'and;' => chr(8743),
|
|||
|
'or;' => chr(8744),
|
|||
|
'cap;' => chr(8745),
|
|||
|
'cup;' => chr(8746),
|
|||
|
'int;' => chr(8747),
|
|||
|
'there4;' => chr(8756),
|
|||
|
'sim;' => chr(8764),
|
|||
|
'cong;' => chr(8773),
|
|||
|
'asymp;' => chr(8776),
|
|||
|
'ne;' => chr(8800),
|
|||
|
'equiv;' => chr(8801),
|
|||
|
'le;' => chr(8804),
|
|||
|
'ge;' => chr(8805),
|
|||
|
'sub;' => chr(8834),
|
|||
|
'sup;' => chr(8835),
|
|||
|
'nsub;' => chr(8836),
|
|||
|
'sube;' => chr(8838),
|
|||
|
'supe;' => chr(8839),
|
|||
|
'oplus;' => chr(8853),
|
|||
|
'otimes;' => chr(8855),
|
|||
|
'perp;' => chr(8869),
|
|||
|
'sdot;' => chr(8901),
|
|||
|
'lceil;' => chr(8968),
|
|||
|
'rceil;' => chr(8969),
|
|||
|
'lfloor;' => chr(8970),
|
|||
|
'rfloor;' => chr(8971),
|
|||
|
'lang;' => chr(9001),
|
|||
|
'rang;' => chr(9002),
|
|||
|
'loz;' => chr(9674),
|
|||
|
'spades;' => chr(9824),
|
|||
|
'clubs;' => chr(9827),
|
|||
|
'hearts;' => chr(9829),
|
|||
|
'diams;' => chr(9830),
|
|||
|
) : ())
|
|||
|
);
|
|||
|
|
|||
|
|
|||
|
# Make the opposite mapping
|
|||
|
while (my($entity, $char) = each(%entity2char)) {
|
|||
|
$entity =~ s/;\z//;
|
|||
|
$char2entity{$char} = "&$entity;";
|
|||
|
}
|
|||
|
delete $char2entity{"'"}; # only one-way decoding
|
|||
|
|
|||
|
# Fill in missing entities
|
|||
|
for (0 .. 255) {
|
|||
|
next if exists $char2entity{chr($_)};
|
|||
|
$char2entity{chr($_)} = "&#$_;";
|
|||
|
}
|
|||
|
|
|||
|
my %subst; # compiled encoding regexps
|
|||
|
|
|||
|
sub encode_entities
|
|||
|
{
|
|||
|
return undef unless defined $_[0];
|
|||
|
my $ref;
|
|||
|
if (defined wantarray) {
|
|||
|
my $x = $_[0];
|
|||
|
$ref = \$x; # copy
|
|||
|
} else {
|
|||
|
$ref = \$_[0]; # modify in-place
|
|||
|
}
|
|||
|
if (defined $_[1] and length $_[1]) {
|
|||
|
unless (exists $subst{$_[1]}) {
|
|||
|
# Because we can't compile regex we fake it with a cached sub
|
|||
|
my $chars = $_[1];
|
|||
|
$chars =~ s,(?<!\\)([]/]),\\$1,g;
|
|||
|
$chars =~ s,(?<!\\)\\\z,\\\\,;
|
|||
|
my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
|
|||
|
$subst{$_[1]} = eval $code;
|
|||
|
die( $@ . " while trying to turn range: \"$_[1]\"\n "
|
|||
|
. "into code: $code\n "
|
|||
|
) if $@;
|
|||
|
}
|
|||
|
&{$subst{$_[1]}}($$ref);
|
|||
|
} else {
|
|||
|
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
|
|||
|
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
|
|||
|
}
|
|||
|
$$ref;
|
|||
|
}
|
|||
|
|
|||
|
sub encode_entities_numeric {
|
|||
|
local %char2entity;
|
|||
|
return &encode_entities; # a goto &encode_entities wouldn't work
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
sub num_entity {
|
|||
|
sprintf "&#x%X;", ord($_[0]);
|
|||
|
}
|
|||
|
|
|||
|
# Set up aliases
|
|||
|
*encode = \&encode_entities;
|
|||
|
*encode_numeric = \&encode_entities_numeric;
|
|||
|
*encode_numerically = \&encode_entities_numeric;
|
|||
|
*decode = \&decode_entities;
|
|||
|
|
|||
|
1;
|