484 lines
15 KiB
Perl
484 lines
15 KiB
Perl
package HTML::Entities;
|
||
|
||
=encoding utf8
|
||
|
||
=head1 NAME
|
||
|
||
HTML::Entities - Encode or decode strings with HTML entities
|
||
|
||
=head1 SYNOPSIS
|
||
|
||
use HTML::Entities;
|
||
|
||
$a = "Våre norske tegn bør æres";
|
||
decode_entities($a);
|
||
encode_entities($a, "\200-\377");
|
||
|
||
For example, this:
|
||
|
||
$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
|
||
print encode_entities($input), "\n"
|
||
|
||
Prints this out:
|
||
|
||
vis-à-vis Beyoncé's naïve
|
||
papier-mâché résumé
|
||
|
||
=head1 DESCRIPTION
|
||
|
||
This module deals with encoding and decoding of strings with HTML
|
||
character entities. The module provides the following functions:
|
||
|
||
=over 4
|
||
|
||
=item decode_entities( $string, ... )
|
||
|
||
This routine replaces HTML entities found in the $string with the
|
||
corresponding Unicode character. Unrecognized entities are left alone.
|
||
|
||
If multiple strings are provided as argument they are each decoded
|
||
separately and the same number of strings are returned.
|
||
|
||
If called in void context the arguments are decoded in-place.
|
||
|
||
This routine is exported by default.
|
||
|
||
=item _decode_entities( $string, \%entity2char )
|
||
|
||
=item _decode_entities( $string, \%entity2char, $expand_prefix )
|
||
|
||
This will in-place replace HTML entities in $string. The %entity2char
|
||
hash must be provided. Named entities not found in the %entity2char
|
||
hash are left alone. Numeric entities are expanded unless their value
|
||
overflow.
|
||
|
||
The keys in %entity2char are the entity names to be expanded and their
|
||
values are what they should expand into. The values do not have to be
|
||
single character strings. If a key has ";" as suffix,
|
||
then occurrences in $string are only expanded if properly terminated
|
||
with ";". Entities without ";" will be expanded regardless of how
|
||
they are terminated for compatibility with how common browsers treat
|
||
entities in the Latin-1 range.
|
||
|
||
If $expand_prefix is TRUE then entities without trailing ";" in
|
||
%entity2char will even be expanded as a prefix of a longer
|
||
unrecognized name. The longest matching name in %entity2char will be
|
||
used. This is mainly present for compatibility with an MSIE
|
||
misfeature.
|
||
|
||
$string = "foo bar";
|
||
_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
|
||
print $string; # will print "foo bar"
|
||
|
||
This routine is exported by default.
|
||
|
||
=item encode_entities( $string )
|
||
|
||
=item encode_entities( $string, $unsafe_chars )
|
||
|
||
This routine replaces unsafe characters in $string with their entity
|
||
representation. A second argument can be given to specify which characters to
|
||
consider unsafe. The unsafe characters is specified using the regular
|
||
expression character class syntax (what you find within brackets in regular
|
||
expressions).
|
||
|
||
The default set of characters to encode are control chars, high-bit chars, and
|
||
the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
|
||
for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
|
||
>> characters:
|
||
|
||
$encoded = encode_entities($input, '<>&"');
|
||
|
||
and this would only encode non-plain ascii:
|
||
|
||
$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
|
||
|
||
This routine is exported by default.
|
||
|
||
=item encode_entities_numeric( $string )
|
||
|
||
=item encode_entities_numeric( $string, $unsafe_chars )
|
||
|
||
This routine works just like encode_entities, except that the replacement
|
||
entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
|
||
example, C<encode_entities("r\xF4le")> returns "rôle", but
|
||
C<encode_entities_numeric("r\xF4le")> returns "rôle".
|
||
|
||
This routine is I<not> exported by default. But you can always
|
||
export it with C<use HTML::Entities qw(encode_entities_numeric);>
|
||
or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
|
||
|
||
=back
|
||
|
||
All these routines modify the string passed as the first argument, if
|
||
called in a void context. In scalar and array contexts, the encoded or
|
||
decoded string is returned (without changing the input string).
|
||
|
||
If you prefer not to import these routines into your namespace, you can
|
||
call them as:
|
||
|
||
use HTML::Entities ();
|
||
$decoded = HTML::Entities::decode($a);
|
||
$encoded = HTML::Entities::encode($a);
|
||
$encoded = HTML::Entities::encode_numeric($a);
|
||
|
||
The module can also export the %char2entity and the %entity2char
|
||
hashes, which contain the mapping from all characters to the
|
||
corresponding entities (and vice versa, respectively).
|
||
|
||
=head1 COPYRIGHT
|
||
|
||
Copyright 1995-2006 Gisle Aas. All rights reserved.
|
||
|
||
This library is free software; you can redistribute it and/or
|
||
modify it under the same terms as Perl itself.
|
||
|
||
=cut
|
||
|
||
use strict;
|
||
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
|
||
use vars qw(%entity2char %char2entity);
|
||
|
||
require 5.004;
|
||
require Exporter;
|
||
@ISA = qw(Exporter);
|
||
|
||
@EXPORT = qw(encode_entities decode_entities _decode_entities);
|
||
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
|
||
|
||
$VERSION = "3.69";
|
||
sub Version { $VERSION; }
|
||
|
||
require HTML::Parser; # for fast XS implemented decode_entities
|
||
|
||
|
||
%entity2char = (
|
||
# Some normal chars that have special meaning in SGML context
|
||
amp => '&', # ampersand
|
||
'gt' => '>', # greater than
|
||
'lt' => '<', # less than
|
||
quot => '"', # double quote
|
||
apos => "'", # single quote
|
||
|
||
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
|
||
AElig => chr(198), # capital AE diphthong (ligature)
|
||
Aacute => chr(193), # capital A, acute accent
|
||
Acirc => chr(194), # capital A, circumflex accent
|
||
Agrave => chr(192), # capital A, grave accent
|
||
Aring => chr(197), # capital A, ring
|
||
Atilde => chr(195), # capital A, tilde
|
||
Auml => chr(196), # capital A, dieresis or umlaut mark
|
||
Ccedil => chr(199), # capital C, cedilla
|
||
ETH => chr(208), # capital Eth, Icelandic
|
||
Eacute => chr(201), # capital E, acute accent
|
||
Ecirc => chr(202), # capital E, circumflex accent
|
||
Egrave => chr(200), # capital E, grave accent
|
||
Euml => chr(203), # capital E, dieresis or umlaut mark
|
||
Iacute => chr(205), # capital I, acute accent
|
||
Icirc => chr(206), # capital I, circumflex accent
|
||
Igrave => chr(204), # capital I, grave accent
|
||
Iuml => chr(207), # capital I, dieresis or umlaut mark
|
||
Ntilde => chr(209), # capital N, tilde
|
||
Oacute => chr(211), # capital O, acute accent
|
||
Ocirc => chr(212), # capital O, circumflex accent
|
||
Ograve => chr(210), # capital O, grave accent
|
||
Oslash => chr(216), # capital O, slash
|
||
Otilde => chr(213), # capital O, tilde
|
||
Ouml => chr(214), # capital O, dieresis or umlaut mark
|
||
THORN => chr(222), # capital THORN, Icelandic
|
||
Uacute => chr(218), # capital U, acute accent
|
||
Ucirc => chr(219), # capital U, circumflex accent
|
||
Ugrave => chr(217), # capital U, grave accent
|
||
Uuml => chr(220), # capital U, dieresis or umlaut mark
|
||
Yacute => chr(221), # capital Y, acute accent
|
||
aacute => chr(225), # small a, acute accent
|
||
acirc => chr(226), # small a, circumflex accent
|
||
aelig => chr(230), # small ae diphthong (ligature)
|
||
agrave => chr(224), # small a, grave accent
|
||
aring => chr(229), # small a, ring
|
||
atilde => chr(227), # small a, tilde
|
||
auml => chr(228), # small a, dieresis or umlaut mark
|
||
ccedil => chr(231), # small c, cedilla
|
||
eacute => chr(233), # small e, acute accent
|
||
ecirc => chr(234), # small e, circumflex accent
|
||
egrave => chr(232), # small e, grave accent
|
||
eth => chr(240), # small eth, Icelandic
|
||
euml => chr(235), # small e, dieresis or umlaut mark
|
||
iacute => chr(237), # small i, acute accent
|
||
icirc => chr(238), # small i, circumflex accent
|
||
igrave => chr(236), # small i, grave accent
|
||
iuml => chr(239), # small i, dieresis or umlaut mark
|
||
ntilde => chr(241), # small n, tilde
|
||
oacute => chr(243), # small o, acute accent
|
||
ocirc => chr(244), # small o, circumflex accent
|
||
ograve => chr(242), # small o, grave accent
|
||
oslash => chr(248), # small o, slash
|
||
otilde => chr(245), # small o, tilde
|
||
ouml => chr(246), # small o, dieresis or umlaut mark
|
||
szlig => chr(223), # small sharp s, German (sz ligature)
|
||
thorn => chr(254), # small thorn, Icelandic
|
||
uacute => chr(250), # small u, acute accent
|
||
ucirc => chr(251), # small u, circumflex accent
|
||
ugrave => chr(249), # small u, grave accent
|
||
uuml => chr(252), # small u, dieresis or umlaut mark
|
||
yacute => chr(253), # small y, acute accent
|
||
yuml => chr(255), # small y, dieresis or umlaut mark
|
||
|
||
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
|
||
copy => chr(169), # copyright sign
|
||
reg => chr(174), # registered sign
|
||
nbsp => chr(160), # non breaking space
|
||
|
||
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
|
||
iexcl => chr(161),
|
||
cent => chr(162),
|
||
pound => chr(163),
|
||
curren => chr(164),
|
||
yen => chr(165),
|
||
brvbar => chr(166),
|
||
sect => chr(167),
|
||
uml => chr(168),
|
||
ordf => chr(170),
|
||
laquo => chr(171),
|
||
'not' => chr(172), # not is a keyword in perl
|
||
shy => chr(173),
|
||
macr => chr(175),
|
||
deg => chr(176),
|
||
plusmn => chr(177),
|
||
sup1 => chr(185),
|
||
sup2 => chr(178),
|
||
sup3 => chr(179),
|
||
acute => chr(180),
|
||
micro => chr(181),
|
||
para => chr(182),
|
||
middot => chr(183),
|
||
cedil => chr(184),
|
||
ordm => chr(186),
|
||
raquo => chr(187),
|
||
frac14 => chr(188),
|
||
frac12 => chr(189),
|
||
frac34 => chr(190),
|
||
iquest => chr(191),
|
||
'times' => chr(215), # times is a keyword in perl
|
||
divide => chr(247),
|
||
|
||
( $] > 5.007 ? (
|
||
'OElig;' => chr(338),
|
||
'oelig;' => chr(339),
|
||
'Scaron;' => chr(352),
|
||
'scaron;' => chr(353),
|
||
'Yuml;' => chr(376),
|
||
'fnof;' => chr(402),
|
||
'circ;' => chr(710),
|
||
'tilde;' => chr(732),
|
||
'Alpha;' => chr(913),
|
||
'Beta;' => chr(914),
|
||
'Gamma;' => chr(915),
|
||
'Delta;' => chr(916),
|
||
'Epsilon;' => chr(917),
|
||
'Zeta;' => chr(918),
|
||
'Eta;' => chr(919),
|
||
'Theta;' => chr(920),
|
||
'Iota;' => chr(921),
|
||
'Kappa;' => chr(922),
|
||
'Lambda;' => chr(923),
|
||
'Mu;' => chr(924),
|
||
'Nu;' => chr(925),
|
||
'Xi;' => chr(926),
|
||
'Omicron;' => chr(927),
|
||
'Pi;' => chr(928),
|
||
'Rho;' => chr(929),
|
||
'Sigma;' => chr(931),
|
||
'Tau;' => chr(932),
|
||
'Upsilon;' => chr(933),
|
||
'Phi;' => chr(934),
|
||
'Chi;' => chr(935),
|
||
'Psi;' => chr(936),
|
||
'Omega;' => chr(937),
|
||
'alpha;' => chr(945),
|
||
'beta;' => chr(946),
|
||
'gamma;' => chr(947),
|
||
'delta;' => chr(948),
|
||
'epsilon;' => chr(949),
|
||
'zeta;' => chr(950),
|
||
'eta;' => chr(951),
|
||
'theta;' => chr(952),
|
||
'iota;' => chr(953),
|
||
'kappa;' => chr(954),
|
||
'lambda;' => chr(955),
|
||
'mu;' => chr(956),
|
||
'nu;' => chr(957),
|
||
'xi;' => chr(958),
|
||
'omicron;' => chr(959),
|
||
'pi;' => chr(960),
|
||
'rho;' => chr(961),
|
||
'sigmaf;' => chr(962),
|
||
'sigma;' => chr(963),
|
||
'tau;' => chr(964),
|
||
'upsilon;' => chr(965),
|
||
'phi;' => chr(966),
|
||
'chi;' => chr(967),
|
||
'psi;' => chr(968),
|
||
'omega;' => chr(969),
|
||
'thetasym;' => chr(977),
|
||
'upsih;' => chr(978),
|
||
'piv;' => chr(982),
|
||
'ensp;' => chr(8194),
|
||
'emsp;' => chr(8195),
|
||
'thinsp;' => chr(8201),
|
||
'zwnj;' => chr(8204),
|
||
'zwj;' => chr(8205),
|
||
'lrm;' => chr(8206),
|
||
'rlm;' => chr(8207),
|
||
'ndash;' => chr(8211),
|
||
'mdash;' => chr(8212),
|
||
'lsquo;' => chr(8216),
|
||
'rsquo;' => chr(8217),
|
||
'sbquo;' => chr(8218),
|
||
'ldquo;' => chr(8220),
|
||
'rdquo;' => chr(8221),
|
||
'bdquo;' => chr(8222),
|
||
'dagger;' => chr(8224),
|
||
'Dagger;' => chr(8225),
|
||
'bull;' => chr(8226),
|
||
'hellip;' => chr(8230),
|
||
'permil;' => chr(8240),
|
||
'prime;' => chr(8242),
|
||
'Prime;' => chr(8243),
|
||
'lsaquo;' => chr(8249),
|
||
'rsaquo;' => chr(8250),
|
||
'oline;' => chr(8254),
|
||
'frasl;' => chr(8260),
|
||
'euro;' => chr(8364),
|
||
'image;' => chr(8465),
|
||
'weierp;' => chr(8472),
|
||
'real;' => chr(8476),
|
||
'trade;' => chr(8482),
|
||
'alefsym;' => chr(8501),
|
||
'larr;' => chr(8592),
|
||
'uarr;' => chr(8593),
|
||
'rarr;' => chr(8594),
|
||
'darr;' => chr(8595),
|
||
'harr;' => chr(8596),
|
||
'crarr;' => chr(8629),
|
||
'lArr;' => chr(8656),
|
||
'uArr;' => chr(8657),
|
||
'rArr;' => chr(8658),
|
||
'dArr;' => chr(8659),
|
||
'hArr;' => chr(8660),
|
||
'forall;' => chr(8704),
|
||
'part;' => chr(8706),
|
||
'exist;' => chr(8707),
|
||
'empty;' => chr(8709),
|
||
'nabla;' => chr(8711),
|
||
'isin;' => chr(8712),
|
||
'notin;' => chr(8713),
|
||
'ni;' => chr(8715),
|
||
'prod;' => chr(8719),
|
||
'sum;' => chr(8721),
|
||
'minus;' => chr(8722),
|
||
'lowast;' => chr(8727),
|
||
'radic;' => chr(8730),
|
||
'prop;' => chr(8733),
|
||
'infin;' => chr(8734),
|
||
'ang;' => chr(8736),
|
||
'and;' => chr(8743),
|
||
'or;' => chr(8744),
|
||
'cap;' => chr(8745),
|
||
'cup;' => chr(8746),
|
||
'int;' => chr(8747),
|
||
'there4;' => chr(8756),
|
||
'sim;' => chr(8764),
|
||
'cong;' => chr(8773),
|
||
'asymp;' => chr(8776),
|
||
'ne;' => chr(8800),
|
||
'equiv;' => chr(8801),
|
||
'le;' => chr(8804),
|
||
'ge;' => chr(8805),
|
||
'sub;' => chr(8834),
|
||
'sup;' => chr(8835),
|
||
'nsub;' => chr(8836),
|
||
'sube;' => chr(8838),
|
||
'supe;' => chr(8839),
|
||
'oplus;' => chr(8853),
|
||
'otimes;' => chr(8855),
|
||
'perp;' => chr(8869),
|
||
'sdot;' => chr(8901),
|
||
'lceil;' => chr(8968),
|
||
'rceil;' => chr(8969),
|
||
'lfloor;' => chr(8970),
|
||
'rfloor;' => chr(8971),
|
||
'lang;' => chr(9001),
|
||
'rang;' => chr(9002),
|
||
'loz;' => chr(9674),
|
||
'spades;' => chr(9824),
|
||
'clubs;' => chr(9827),
|
||
'hearts;' => chr(9829),
|
||
'diams;' => chr(9830),
|
||
) : ())
|
||
);
|
||
|
||
|
||
# Make the opposite mapping
|
||
while (my($entity, $char) = each(%entity2char)) {
|
||
$entity =~ s/;\z//;
|
||
$char2entity{$char} = "&$entity;";
|
||
}
|
||
delete $char2entity{"'"}; # only one-way decoding
|
||
|
||
# Fill in missing entities
|
||
for (0 .. 255) {
|
||
next if exists $char2entity{chr($_)};
|
||
$char2entity{chr($_)} = "&#$_;";
|
||
}
|
||
|
||
my %subst; # compiled encoding regexps
|
||
|
||
sub encode_entities
|
||
{
|
||
return undef unless defined $_[0];
|
||
my $ref;
|
||
if (defined wantarray) {
|
||
my $x = $_[0];
|
||
$ref = \$x; # copy
|
||
} else {
|
||
$ref = \$_[0]; # modify in-place
|
||
}
|
||
if (defined $_[1] and length $_[1]) {
|
||
unless (exists $subst{$_[1]}) {
|
||
# Because we can't compile regex we fake it with a cached sub
|
||
my $chars = $_[1];
|
||
$chars =~ s,(?<!\\)([]/]),\\$1,g;
|
||
$chars =~ s,(?<!\\)\\\z,\\\\,;
|
||
my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
|
||
$subst{$_[1]} = eval $code;
|
||
die( $@ . " while trying to turn range: \"$_[1]\"\n "
|
||
. "into code: $code\n "
|
||
) if $@;
|
||
}
|
||
&{$subst{$_[1]}}($$ref);
|
||
} else {
|
||
# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
|
||
$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
|
||
}
|
||
$$ref;
|
||
}
|
||
|
||
sub encode_entities_numeric {
|
||
local %char2entity;
|
||
return &encode_entities; # a goto &encode_entities wouldn't work
|
||
}
|
||
|
||
|
||
sub num_entity {
|
||
sprintf "&#x%X;", ord($_[0]);
|
||
}
|
||
|
||
# Set up aliases
|
||
*encode = \&encode_entities;
|
||
*encode_numeric = \&encode_entities_numeric;
|
||
*encode_numerically = \&encode_entities_numeric;
|
||
*decode = \&decode_entities;
|
||
|
||
1;
|