| 1 |
package GScholar; |
|---|
| 2 |
|
|---|
| 3 |
use strict; |
|---|
| 4 |
use warnings; |
|---|
| 5 |
|
|---|
| 6 |
use WWW::Mechanize::Sleepy; |
|---|
| 7 |
use URI; |
|---|
| 8 |
|
|---|
| 9 |
our $MECH = WWW::Mechanize::Sleepy->new( |
|---|
| 10 |
sleep => '5..15', |
|---|
| 11 |
autocheck => 1, |
|---|
| 12 |
); |
|---|
| 13 |
|
|---|
| 14 |
$MECH->agent_alias( "Linux Mozilla" ); |
|---|
| 15 |
|
|---|
| 16 |
our $SCHOLAR = URI->new( "http://scholar.google.com/scholar" ); |
|---|
| 17 |
|
|---|
| 18 |
sub get_cites |
|---|
| 19 |
{ |
|---|
| 20 |
my( $session, $eprint ) = @_; |
|---|
| 21 |
|
|---|
| 22 |
my $title = $eprint->get_value( "title" ); |
|---|
| 23 |
$title =~ s/^(.{30,}?):\s.*$/$1/; |
|---|
| 24 |
|
|---|
| 25 |
my $creator = (@{$eprint->get_value( "creators_name" )})[0]; |
|---|
| 26 |
$creator = $creator->{family}; |
|---|
| 27 |
|
|---|
| 28 |
my $eprint_link = $eprint->get_url; |
|---|
| 29 |
$eprint_link =~ s/(\d+\/)/(?:archive\/0+)?$1/; |
|---|
| 30 |
|
|---|
| 31 |
my $quri = $SCHOLAR->clone; |
|---|
| 32 |
|
|---|
| 33 |
$quri->query_form( |
|---|
| 34 |
q => "$title author:$creator" |
|---|
| 35 |
); |
|---|
| 36 |
|
|---|
| 37 |
my $cluster_id; |
|---|
| 38 |
|
|---|
| 39 |
print STDERR "GET $quri\n" if $session->{noise} > 1; |
|---|
| 40 |
my $r = $MECH->get( $quri ); |
|---|
| 41 |
die $r->code unless $r->is_success; |
|---|
| 42 |
|
|---|
| 43 |
|
|---|
| 44 |
my $by_url = $MECH->find_link( url_regex => qr/^$eprint_link/ ); |
|---|
| 45 |
|
|---|
| 46 |
my $title_re = substr($title,0,200); |
|---|
| 47 |
$title_re =~ s/[^\w\s]/\.?/g; |
|---|
| 48 |
$title_re =~ s/\s+/(?:\\s|(?:<\\\/?b>))+/g; |
|---|
| 49 |
my $by_title = $MECH->find_link( text_regex => qr/^(?:<b>)?$title_re/i ); |
|---|
| 50 |
for( grep { defined $_ } $by_url, $by_title ) |
|---|
| 51 |
{ |
|---|
| 52 |
my @links = $MECH->links; |
|---|
| 53 |
my $i; |
|---|
| 54 |
for($i = 0; $i < @links; ++$i) |
|---|
| 55 |
{ |
|---|
| 56 |
last if $links[$i]->url eq $_->url; |
|---|
| 57 |
} |
|---|
| 58 |
for(; $i < @links; ++$i) |
|---|
| 59 |
{ |
|---|
| 60 |
if( $links[$i]->text =~ /^all \d+ versions/ ) |
|---|
| 61 |
{ |
|---|
| 62 |
$cluster_id = {$links[$i]->URI->query_form}->{"cluster"}; |
|---|
| 63 |
last; |
|---|
| 64 |
} |
|---|
| 65 |
if( $links[$i]->text =~ /^Cited by \d+/ ) |
|---|
| 66 |
{ |
|---|
| 67 |
$cluster_id = {$links[$i]->URI->query_form}->{"cites"}; |
|---|
| 68 |
last; |
|---|
| 69 |
} |
|---|
| 70 |
if( $links[$i]->text =~ /Web Search/ ) |
|---|
| 71 |
{ |
|---|
| 72 |
last; |
|---|
| 73 |
} |
|---|
| 74 |
} |
|---|
| 75 |
} |
|---|
| 76 |
|
|---|
| 77 |
unless( $cluster_id ) |
|---|
| 78 |
{ |
|---|
| 79 |
my @clusters = $MECH->find_all_links( text_regex => qr/all \d+ versions/ ); |
|---|
| 80 |
for(@clusters) |
|---|
| 81 |
{ |
|---|
| 82 |
my $url = $_->URI; |
|---|
| 83 |
print STDERR "GET $url\n" if $session->{noise} > 1; |
|---|
| 84 |
$MECH->get( $url ); |
|---|
| 85 |
|
|---|
| 86 |
my $by_link = $MECH->find_link( url_regex => qr/^$eprint_link/ ); |
|---|
| 87 |
|
|---|
| 88 |
$MECH->back; |
|---|
| 89 |
|
|---|
| 90 |
if( $by_link ) |
|---|
| 91 |
{ |
|---|
| 92 |
$cluster_id = {$url->query_form}->{cluster}; |
|---|
| 93 |
last; |
|---|
| 94 |
} |
|---|
| 95 |
} |
|---|
| 96 |
} |
|---|
| 97 |
|
|---|
| 98 |
unless( $cluster_id ) |
|---|
| 99 |
{ |
|---|
| 100 |
print STDERR "No match for ".$eprint->get_id."\n" if $session->{noise} > 1; |
|---|
| 101 |
return undef; |
|---|
| 102 |
} |
|---|
| 103 |
|
|---|
| 104 |
my $cites = 0; |
|---|
| 105 |
my $cites_link = $MECH->find_link( |
|---|
| 106 |
text_regex => qr/Cited by \d+/, |
|---|
| 107 |
url_regex => qr/\b$cluster_id\b/ |
|---|
| 108 |
); |
|---|
| 109 |
|
|---|
| 110 |
if( $cites_link ) |
|---|
| 111 |
{ |
|---|
| 112 |
$cites_link->text =~ /(\d+)/; |
|---|
| 113 |
$cites = $1; |
|---|
| 114 |
} |
|---|
| 115 |
|
|---|
| 116 |
return { |
|---|
| 117 |
cluster => $cluster_id, |
|---|
| 118 |
impact => $cites, |
|---|
| 119 |
}; |
|---|
| 120 |
} |
|---|
| 121 |
|
|---|
| 122 |
1; |
|---|