313 lines
7.7 KiB
Perl
313 lines
7.7 KiB
Perl
package LWP::RobotUA;
|
|
|
|
use base qw(LWP::UserAgent);
|
|
|
|
our $VERSION = '6.36';
|
|
|
|
require WWW::RobotRules;
|
|
require HTTP::Request;
|
|
require HTTP::Response;
|
|
|
|
use Carp ();
|
|
use HTTP::Status ();
|
|
use HTTP::Date qw(time2str);
|
|
use strict;
|
|
|
|
|
|
#
|
|
# Additional attributes in addition to those found in LWP::UserAgent:
|
|
#
|
|
# $self->{'delay'} Required delay between request to the same
|
|
# server in minutes.
|
|
#
|
|
# $self->{'rules'} A WWW::RobotRules object
|
|
#
|
|
|
|
sub new
|
|
{
|
|
my $class = shift;
|
|
my %cnf;
|
|
if (@_ < 4) {
|
|
# legacy args
|
|
@cnf{qw(agent from rules)} = @_;
|
|
}
|
|
else {
|
|
%cnf = @_;
|
|
}
|
|
|
|
Carp::croak('LWP::RobotUA agent required') unless $cnf{agent};
|
|
Carp::croak('LWP::RobotUA from address required')
|
|
unless $cnf{from} && $cnf{from} =~ m/\@/;
|
|
|
|
my $delay = delete $cnf{delay} || 1;
|
|
my $use_sleep = delete $cnf{use_sleep};
|
|
$use_sleep = 1 unless defined($use_sleep);
|
|
my $rules = delete $cnf{rules};
|
|
|
|
my $self = LWP::UserAgent->new(%cnf);
|
|
$self = bless $self, $class;
|
|
|
|
$self->{'delay'} = $delay; # minutes
|
|
$self->{'use_sleep'} = $use_sleep;
|
|
|
|
if ($rules) {
|
|
$rules->agent($cnf{agent});
|
|
$self->{'rules'} = $rules;
|
|
}
|
|
else {
|
|
$self->{'rules'} = WWW::RobotRules->new($cnf{agent});
|
|
}
|
|
|
|
$self;
|
|
}
|
|
|
|
|
|
sub delay { shift->_elem('delay', @_); }
|
|
sub use_sleep { shift->_elem('use_sleep', @_); }
|
|
|
|
|
|
sub agent
|
|
{
|
|
my $self = shift;
|
|
my $old = $self->SUPER::agent(@_);
|
|
if (@_) {
|
|
# Changing our name means to start fresh
|
|
$self->{'rules'}->agent($self->{'agent'});
|
|
}
|
|
$old;
|
|
}
|
|
|
|
|
|
sub rules {
|
|
my $self = shift;
|
|
my $old = $self->_elem('rules', @_);
|
|
$self->{'rules'}->agent($self->{'agent'}) if @_;
|
|
$old;
|
|
}
|
|
|
|
|
|
sub no_visits
|
|
{
|
|
my($self, $netloc) = @_;
|
|
$self->{'rules'}->no_visits($netloc) || 0;
|
|
}
|
|
|
|
*host_count = \&no_visits; # backwards compatibility with LWP-5.02
|
|
|
|
|
|
sub host_wait
|
|
{
|
|
my($self, $netloc) = @_;
|
|
return undef unless defined $netloc;
|
|
my $last = $self->{'rules'}->last_visit($netloc);
|
|
if ($last) {
|
|
my $wait = int($self->{'delay'} * 60 - (time - $last));
|
|
$wait = 0 if $wait < 0;
|
|
return $wait;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
sub simple_request
|
|
{
|
|
my($self, $request, $arg, $size) = @_;
|
|
|
|
# Do we try to access a new server?
|
|
my $allowed = $self->{'rules'}->allowed($request->uri);
|
|
|
|
if ($allowed < 0) {
|
|
# Host is not visited before, or robots.txt expired; fetch "robots.txt"
|
|
my $robot_url = $request->uri->clone;
|
|
$robot_url->path("robots.txt");
|
|
$robot_url->query(undef);
|
|
|
|
# make access to robot.txt legal since this will be a recursive call
|
|
$self->{'rules'}->parse($robot_url, "");
|
|
|
|
my $robot_req = HTTP::Request->new('GET', $robot_url);
|
|
my $parse_head = $self->parse_head(0);
|
|
my $robot_res = $self->request($robot_req);
|
|
$self->parse_head($parse_head);
|
|
my $fresh_until = $robot_res->fresh_until;
|
|
my $content = "";
|
|
if ($robot_res->is_success && $robot_res->content_is_text) {
|
|
$content = $robot_res->decoded_content;
|
|
$content = "" unless $content && $content =~ /^\s*Disallow\s*:/mi;
|
|
}
|
|
$self->{'rules'}->parse($robot_url, $content, $fresh_until);
|
|
|
|
# recalculate allowed...
|
|
$allowed = $self->{'rules'}->allowed($request->uri);
|
|
}
|
|
|
|
# Check rules
|
|
unless ($allowed) {
|
|
my $res = HTTP::Response->new(
|
|
HTTP::Status::RC_FORBIDDEN, 'Forbidden by robots.txt');
|
|
$res->request( $request ); # bind it to that request
|
|
return $res;
|
|
}
|
|
|
|
my $netloc = eval { local $SIG{__DIE__}; $request->uri->host_port; };
|
|
my $wait = $self->host_wait($netloc);
|
|
|
|
if ($wait) {
|
|
if ($self->{'use_sleep'}) {
|
|
sleep($wait)
|
|
}
|
|
else {
|
|
my $res = HTTP::Response->new(
|
|
HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down');
|
|
$res->header('Retry-After', time2str(time + $wait));
|
|
$res->request( $request ); # bind it to that request
|
|
return $res;
|
|
}
|
|
}
|
|
|
|
# Perform the request
|
|
my $res = $self->SUPER::simple_request($request, $arg, $size);
|
|
|
|
$self->{'rules'}->visit($netloc);
|
|
|
|
$res;
|
|
}
|
|
|
|
|
|
sub as_string
|
|
{
|
|
my $self = shift;
|
|
my @s;
|
|
push(@s, "Robot: $self->{'agent'} operated by $self->{'from'} [$self]");
|
|
push(@s, " Minimum delay: " . int($self->{'delay'}*60) . "s");
|
|
push(@s, " Will sleep if too early") if $self->{'use_sleep'};
|
|
push(@s, " Rules = $self->{'rules'}");
|
|
join("\n", @s, '');
|
|
}
|
|
|
|
1;
|
|
|
|
|
|
__END__
|
|
|
|
=pod
|
|
|
|
=head1 NAME
|
|
|
|
LWP::RobotUA - a class for well-behaved Web robots
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use LWP::RobotUA;
|
|
my $ua = LWP::RobotUA->new('my-robot/0.1', 'me@foo.com');
|
|
$ua->delay(10); # be very nice -- max one hit every ten minutes!
|
|
...
|
|
|
|
# Then just use it just like a normal LWP::UserAgent:
|
|
my $response = $ua->get('http://whatever.int/...');
|
|
...
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This class implements a user agent that is suitable for robot
|
|
applications. Robots should be nice to the servers they visit. They
|
|
should consult the F</robots.txt> file to ensure that they are welcomed
|
|
and they should not make requests too frequently.
|
|
|
|
But before you consider writing a robot, take a look at
|
|
L<URL:http://www.robotstxt.org/>.
|
|
|
|
When you use an I<LWP::RobotUA> object as your user agent, then you do not
|
|
really have to think about these things yourself; C<robots.txt> files
|
|
are automatically consulted and obeyed, the server isn't queried
|
|
too rapidly, and so on. Just send requests
|
|
as you do when you are using a normal I<LWP::UserAgent>
|
|
object (using C<< $ua->get(...) >>, C<< $ua->head(...) >>,
|
|
C<< $ua->request(...) >>, etc.), and this
|
|
special agent will make sure you are nice.
|
|
|
|
=head1 METHODS
|
|
|
|
The LWP::RobotUA is a sub-class of L<LWP::UserAgent> and implements the
|
|
same methods. In addition the following methods are provided:
|
|
|
|
=head2 new
|
|
|
|
my $ua = LWP::RobotUA->new( %options )
|
|
my $ua = LWP::RobotUA->new( $agent, $from )
|
|
my $ua = LWP::RobotUA->new( $agent, $from, $rules )
|
|
|
|
The LWP::UserAgent options C<agent> and C<from> are mandatory. The
|
|
options C<delay>, C<use_sleep> and C<rules> initialize attributes
|
|
private to the RobotUA. If C<rules> are not provided, then
|
|
C<WWW::RobotRules> is instantiated providing an internal database of
|
|
F<robots.txt>.
|
|
|
|
It is also possible to just pass the value of C<agent>, C<from> and
|
|
optionally C<rules> as plain positional arguments.
|
|
|
|
=head2 delay
|
|
|
|
my $delay = $ua->delay;
|
|
$ua->delay( $minutes );
|
|
|
|
Get/set the minimum delay between requests to the same server, in
|
|
I<minutes>. The default is C<1> minute. Note that this number doesn't
|
|
have to be an integer; for example, this sets the delay to C<10> seconds:
|
|
|
|
$ua->delay(10/60);
|
|
|
|
=head2 use_sleep
|
|
|
|
my $bool = $ua->use_sleep;
|
|
$ua->use_sleep( $boolean );
|
|
|
|
Get/set a value indicating whether the UA should L<LWP::RobotUA/sleep> if
|
|
requests arrive too fast, defined as C<< $ua->delay >> minutes not passed since
|
|
last request to the given server. The default is true. If this value is
|
|
false then an internal C<SERVICE_UNAVAILABLE> response will be generated.
|
|
It will have a C<Retry-After> header that indicates when it is OK to
|
|
send another request to this server.
|
|
|
|
=head2 rules
|
|
|
|
my $rules = $ua->rules;
|
|
$ua->rules( $rules );
|
|
|
|
Set/get which I<WWW::RobotRules> object to use.
|
|
|
|
=head2 no_visits
|
|
|
|
my $num = $ua->no_visits( $netloc )
|
|
|
|
Returns the number of documents fetched from this server host. Yeah I
|
|
know, this method should probably have been named C<num_visits> or
|
|
something like that. :-(
|
|
|
|
=head2 host_wait
|
|
|
|
my $num = $ua->host_wait( $netloc )
|
|
|
|
Returns the number of I<seconds> (from now) you must wait before you can
|
|
make a new request to this host.
|
|
|
|
=head2 as_string
|
|
|
|
my $string = $ua->as_string;
|
|
|
|
Returns a string that describes the state of the UA.
|
|
Mainly useful for debugging.
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<LWP::UserAgent>, L<WWW::RobotRules>
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright 1996-2004 Gisle Aas.
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself.
|
|
|
|
=cut
|