374 lines
12 KiB
Perl
374 lines
12 KiB
Perl
package Encode::Locale;
|
|
|
|
use strict;
|
|
our $VERSION = "1.05";
|
|
|
|
use base 'Exporter';
|
|
our @EXPORT_OK = qw(
|
|
decode_argv env
|
|
$ENCODING_LOCALE $ENCODING_LOCALE_FS
|
|
$ENCODING_CONSOLE_IN $ENCODING_CONSOLE_OUT
|
|
);
|
|
|
|
use Encode ();
|
|
use Encode::Alias ();
|
|
|
|
our $ENCODING_LOCALE;
|
|
our $ENCODING_LOCALE_FS;
|
|
our $ENCODING_CONSOLE_IN;
|
|
our $ENCODING_CONSOLE_OUT;
|
|
|
|
sub DEBUG () { 0 }
|
|
|
|
sub _init {
|
|
if ($^O eq "MSWin32") {
|
|
unless ($ENCODING_LOCALE) {
|
|
# Try to obtain what the Windows ANSI code page is
|
|
eval {
|
|
unless (defined &GetACP) {
|
|
require Win32;
|
|
eval { Win32::GetACP() };
|
|
*GetACP = sub { &Win32::GetACP } unless $@;
|
|
}
|
|
unless (defined &GetACP) {
|
|
require Win32::API;
|
|
Win32::API->Import('kernel32', 'int GetACP()');
|
|
}
|
|
if (defined &GetACP) {
|
|
my $cp = GetACP();
|
|
$ENCODING_LOCALE = "cp$cp" if $cp;
|
|
}
|
|
};
|
|
}
|
|
|
|
unless ($ENCODING_CONSOLE_IN) {
|
|
# only test one since set together
|
|
unless (defined &GetInputCP) {
|
|
eval {
|
|
require Win32;
|
|
eval { Win32::GetConsoleCP() };
|
|
# manually "import" it since Win32->import refuses
|
|
*GetInputCP = sub { &Win32::GetConsoleCP } unless $@;
|
|
*GetOutputCP = sub { &Win32::GetConsoleOutputCP } unless $@;
|
|
};
|
|
unless (defined &GetInputCP) {
|
|
eval {
|
|
# try Win32::Console module for codepage to use
|
|
require Win32::Console;
|
|
eval { Win32::Console::InputCP() };
|
|
*GetInputCP = sub { &Win32::Console::InputCP }
|
|
unless $@;
|
|
*GetOutputCP = sub { &Win32::Console::OutputCP }
|
|
unless $@;
|
|
};
|
|
}
|
|
unless (defined &GetInputCP) {
|
|
# final fallback
|
|
*GetInputCP = *GetOutputCP = sub {
|
|
# another fallback that could work is:
|
|
# reg query HKLM\System\CurrentControlSet\Control\Nls\CodePage /v ACP
|
|
((qx(chcp) || '') =~ /^Active code page: (\d+)/)
|
|
? $1 : ();
|
|
};
|
|
}
|
|
}
|
|
my $cp = GetInputCP();
|
|
$ENCODING_CONSOLE_IN = "cp$cp" if $cp;
|
|
$cp = GetOutputCP();
|
|
$ENCODING_CONSOLE_OUT = "cp$cp" if $cp;
|
|
}
|
|
}
|
|
|
|
unless ($ENCODING_LOCALE) {
|
|
eval {
|
|
require I18N::Langinfo;
|
|
$ENCODING_LOCALE = I18N::Langinfo::langinfo(I18N::Langinfo::CODESET());
|
|
|
|
# Workaround of Encode < v2.25. The "646" encoding alias was
|
|
# introduced in Encode-2.25, but we don't want to require that version
|
|
# quite yet. Should avoid the CPAN testers failure reported from
|
|
# openbsd-4.7/perl-5.10.0 combo.
|
|
$ENCODING_LOCALE = "ascii" if $ENCODING_LOCALE eq "646";
|
|
|
|
# https://rt.cpan.org/Ticket/Display.html?id=66373
|
|
$ENCODING_LOCALE = "hp-roman8" if $^O eq "hpux" && $ENCODING_LOCALE eq "roman8";
|
|
};
|
|
$ENCODING_LOCALE ||= $ENCODING_CONSOLE_IN;
|
|
}
|
|
|
|
if ($^O eq "darwin") {
|
|
$ENCODING_LOCALE_FS ||= "UTF-8";
|
|
}
|
|
|
|
# final fallback
|
|
$ENCODING_LOCALE ||= $^O eq "MSWin32" ? "cp1252" : "UTF-8";
|
|
$ENCODING_LOCALE_FS ||= $ENCODING_LOCALE;
|
|
$ENCODING_CONSOLE_IN ||= $ENCODING_LOCALE;
|
|
$ENCODING_CONSOLE_OUT ||= $ENCODING_CONSOLE_IN;
|
|
|
|
unless (Encode::find_encoding($ENCODING_LOCALE)) {
|
|
my $foundit;
|
|
if (lc($ENCODING_LOCALE) eq "gb18030") {
|
|
eval {
|
|
require Encode::HanExtra;
|
|
};
|
|
if ($@) {
|
|
die "Need Encode::HanExtra to be installed to support locale codeset ($ENCODING_LOCALE), stopped";
|
|
}
|
|
$foundit++ if Encode::find_encoding($ENCODING_LOCALE);
|
|
}
|
|
die "The locale codeset ($ENCODING_LOCALE) isn't one that perl can decode, stopped"
|
|
unless $foundit;
|
|
|
|
}
|
|
|
|
# use Data::Dump; ddx $ENCODING_LOCALE, $ENCODING_LOCALE_FS, $ENCODING_CONSOLE_IN, $ENCODING_CONSOLE_OUT;
|
|
}
|
|
|
|
_init();
|
|
Encode::Alias::define_alias(sub {
|
|
no strict 'refs';
|
|
no warnings 'once';
|
|
return ${"ENCODING_" . uc(shift)};
|
|
}, "locale");
|
|
|
|
sub _flush_aliases {
|
|
no strict 'refs';
|
|
for my $a (keys %Encode::Alias::Alias) {
|
|
if (defined ${"ENCODING_" . uc($a)}) {
|
|
delete $Encode::Alias::Alias{$a};
|
|
warn "Flushed alias cache for $a" if DEBUG;
|
|
}
|
|
}
|
|
}
|
|
|
|
sub reinit {
|
|
$ENCODING_LOCALE = shift;
|
|
$ENCODING_LOCALE_FS = shift;
|
|
$ENCODING_CONSOLE_IN = $ENCODING_LOCALE;
|
|
$ENCODING_CONSOLE_OUT = $ENCODING_LOCALE;
|
|
_init();
|
|
_flush_aliases();
|
|
}
|
|
|
|
sub decode_argv {
|
|
die if defined wantarray;
|
|
for (@ARGV) {
|
|
$_ = Encode::decode(locale => $_, @_);
|
|
}
|
|
}
|
|
|
|
sub env {
|
|
my $k = Encode::encode(locale => shift);
|
|
my $old = $ENV{$k};
|
|
if (@_) {
|
|
my $v = shift;
|
|
if (defined $v) {
|
|
$ENV{$k} = Encode::encode(locale => $v);
|
|
}
|
|
else {
|
|
delete $ENV{$k};
|
|
}
|
|
}
|
|
return Encode::decode(locale => $old) if defined wantarray;
|
|
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
Encode::Locale - Determine the locale encoding
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use Encode::Locale;
|
|
use Encode;
|
|
|
|
$string = decode(locale => $bytes);
|
|
$bytes = encode(locale => $string);
|
|
|
|
if (-t) {
|
|
binmode(STDIN, ":encoding(console_in)");
|
|
binmode(STDOUT, ":encoding(console_out)");
|
|
binmode(STDERR, ":encoding(console_out)");
|
|
}
|
|
|
|
# Processing file names passed in as arguments
|
|
my $uni_filename = decode(locale => $ARGV[0]);
|
|
open(my $fh, "<", encode(locale_fs => $uni_filename))
|
|
|| die "Can't open '$uni_filename': $!";
|
|
binmode($fh, ":encoding(locale)");
|
|
...
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
In many applications it's wise to let Perl use Unicode for the strings it
|
|
processes. Most of the interfaces Perl has to the outside world are still byte
|
|
based. Programs therefore need to decode byte strings that enter the program
|
|
from the outside and encode them again on the way out.
|
|
|
|
The POSIX locale system is used to specify both the language conventions
|
|
requested by the user and the preferred character set to consume and
|
|
output. The C<Encode::Locale> module looks up the charset and encoding (called
|
|
a CODESET in the locale jargon) and arranges for the L<Encode> module to know
|
|
this encoding under the name "locale". It means bytes obtained from the
|
|
environment can be converted to Unicode strings by calling C<<
|
|
Encode::encode(locale => $bytes) >> and converted back again with C<<
|
|
Encode::decode(locale => $string) >>.
|
|
|
|
Where file systems interfaces pass file names in and out of the program we also
|
|
need care. The trend is for operating systems to use a fixed file encoding
|
|
that don't actually depend on the locale; and this module determines the most
|
|
appropriate encoding for file names. The L<Encode> module will know this
|
|
encoding under the name "locale_fs". For traditional Unix systems this will
|
|
be an alias to the same encoding as "locale".
|
|
|
|
For programs running in a terminal window (called a "Console" on some systems)
|
|
the "locale" encoding is usually a good choice for what to expect as input and
|
|
output. Some systems allows us to query the encoding set for the terminal and
|
|
C<Encode::Locale> will do that if available and make these encodings known
|
|
under the C<Encode> aliases "console_in" and "console_out". For systems where
|
|
we can't determine the terminal encoding these will be aliased as the same
|
|
encoding as "locale". The advice is to use "console_in" for input known to
|
|
come from the terminal and "console_out" for output to the terminal.
|
|
|
|
In addition to arranging for various Encode aliases the following functions and
|
|
variables are provided:
|
|
|
|
=over
|
|
|
|
=item decode_argv( )
|
|
|
|
=item decode_argv( Encode::FB_CROAK )
|
|
|
|
This will decode the command line arguments to perl (the C<@ARGV> array) in-place.
|
|
|
|
The function will by default replace characters that can't be decoded by
|
|
"\x{FFFD}", the Unicode replacement character.
|
|
|
|
Any argument provided is passed as CHECK to underlying Encode::decode() call.
|
|
Pass the value C<Encode::FB_CROAK> to have the decoding croak if not all the
|
|
command line arguments can be decoded. See L<Encode/"Handling Malformed Data">
|
|
for details on other options for CHECK.
|
|
|
|
=item env( $uni_key )
|
|
|
|
=item env( $uni_key => $uni_value )
|
|
|
|
Interface to get/set environment variables. Returns the current value as a
|
|
Unicode string. The $uni_key and $uni_value arguments are expected to be
|
|
Unicode strings as well. Passing C<undef> as $uni_value deletes the
|
|
environment variable named $uni_key.
|
|
|
|
The returned value will have the characters that can't be decoded replaced by
|
|
"\x{FFFD}", the Unicode replacement character.
|
|
|
|
There is no interface to request alternative CHECK behavior as for
|
|
decode_argv(). If you need that you need to call encode/decode yourself.
|
|
For example:
|
|
|
|
my $key = Encode::encode(locale => $uni_key, Encode::FB_CROAK);
|
|
my $uni_value = Encode::decode(locale => $ENV{$key}, Encode::FB_CROAK);
|
|
|
|
=item reinit( )
|
|
|
|
=item reinit( $encoding )
|
|
|
|
Reinitialize the encodings from the locale. You want to call this function if
|
|
you changed anything in the environment that might influence the locale.
|
|
|
|
This function will croak if the determined encoding isn't recognized by
|
|
the Encode module.
|
|
|
|
With argument force $ENCODING_... variables to set to the given value.
|
|
|
|
=item $ENCODING_LOCALE
|
|
|
|
The encoding name determined to be suitable for the current locale.
|
|
L<Encode> know this encoding as "locale".
|
|
|
|
=item $ENCODING_LOCALE_FS
|
|
|
|
The encoding name determined to be suitable for file system interfaces
|
|
involving file names.
|
|
L<Encode> know this encoding as "locale_fs".
|
|
|
|
=item $ENCODING_CONSOLE_IN
|
|
|
|
=item $ENCODING_CONSOLE_OUT
|
|
|
|
The encodings to be used for reading and writing output to the a console.
|
|
L<Encode> know these encodings as "console_in" and "console_out".
|
|
|
|
=back
|
|
|
|
=head1 NOTES
|
|
|
|
This table summarizes the mapping of the encodings set up
|
|
by the C<Encode::Locale> module:
|
|
|
|
Encode | | |
|
|
Alias | Windows | Mac OS X | POSIX
|
|
------------+---------+--------------+------------
|
|
locale | ANSI | nl_langinfo | nl_langinfo
|
|
locale_fs | ANSI | UTF-8 | nl_langinfo
|
|
console_in | OEM | nl_langinfo | nl_langinfo
|
|
console_out | OEM | nl_langinfo | nl_langinfo
|
|
|
|
=head2 Windows
|
|
|
|
Windows has basically 2 sets of APIs. A wide API (based on passing UTF-16
|
|
strings) and a byte based API based a character set called ANSI. The
|
|
regular Perl interfaces to the OS currently only uses the ANSI APIs.
|
|
Unfortunately ANSI is not a single character set.
|
|
|
|
The encoding that corresponds to ANSI varies between different editions of
|
|
Windows. For many western editions of Windows ANSI corresponds to CP-1252
|
|
which is a character set similar to ISO-8859-1. Conceptually the ANSI
|
|
character set is a similar concept to the POSIX locale CODESET so this module
|
|
figures out what the ANSI code page is and make this available as
|
|
$ENCODING_LOCALE and the "locale" Encoding alias.
|
|
|
|
Windows systems also operate with another byte based character set.
|
|
It's called the OEM code page. This is the encoding that the Console
|
|
takes as input and output. It's common for the OEM code page to
|
|
differ from the ANSI code page.
|
|
|
|
=head2 Mac OS X
|
|
|
|
On Mac OS X the file system encoding is always UTF-8 while the locale
|
|
can otherwise be set up as normal for POSIX systems.
|
|
|
|
File names on Mac OS X will at the OS-level be converted to
|
|
NFD-form. A file created by passing a NFC-filename will come
|
|
in NFD-form from readdir(). See L<Unicode::Normalize> for details
|
|
of NFD/NFC.
|
|
|
|
Actually, Apple does not follow the Unicode NFD standard since not all
|
|
character ranges are decomposed. The claim is that this avoids problems with
|
|
round trip conversions from old Mac text encodings. See L<Encode::UTF8Mac> for
|
|
details.
|
|
|
|
=head2 POSIX (Linux and other Unixes)
|
|
|
|
File systems might vary in what encoding is to be used for
|
|
filenames. Since this module has no way to actually figure out
|
|
what the is correct it goes with the best guess which is to
|
|
assume filenames are encoding according to the current locale.
|
|
Users are advised to always specify UTF-8 as the locale charset.
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<I18N::Langinfo>, L<Encode>, L<Term::Encoding>
|
|
|
|
=head1 AUTHOR
|
|
|
|
Copyright 2010 Gisle Aas <gisle@aas.no>.
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself.
|
|
|
|
=cut
|