Changeset 446
- Timestamp:
- 2007-06-01 16:28:38 (1 year ago)
- Files:
-
- trunk/celestial-2/bin/celestial (modified) (8 diffs)
- trunk/celestial-2/lib/Celestial/DBI.pm (modified) (3 diffs)
- trunk/celestial-2/lib/Celestial/FullText.pm (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/celestial-2/bin/celestial
r445 r446 369 369 my $startTime = $dbh->now(); 370 370 $ftt->synchronize($oai_dc); # Remove orphaned records 371 my $ st= harvest_Fulltext( $dbh,371 my $ok = harvest_Fulltext( $dbh, 372 372 harvestAgent => $ha, 373 373 repository => $repo, … … 375 375 from => $ftt->lastHarvest 376 376 ); 377 if( $ st) {377 if( $ok ) { 378 378 $ftt->lastHarvest( $startTime ); 379 379 } … … 446 446 } 447 447 my $rec = $r->next || return (undef,undef,undef); 448 printf("%s => %s\n", $rec->identifier, $rec->datestamp) if $opt_verbose > 1; 448 449 $mdf->addRecord($rec); 449 450 … … 464 465 } 465 466 $rec = $rec->next || next; 466 if( $mdf->addRecord($rec) ) { 467 # print STDERR "(" . $repo->identifier . "/" . $mdf->metadataPrefix .") GetRecord: Added " . $rec->identifier . "\n"; 468 } 467 printf("%s => %s\n", $rec->identifier, $rec->datestamp) if $opt_verbose > 1; 468 $mdf->addRecord($rec); 469 469 # Add a small delay so we don't overrun the repository 470 470 select(undef,undef,undef,0.25); … … 541 541 my( $ha, $repo, $from, $identifier ) = @args{qw( harvestAgent repository from identifier )}; 542 542 543 my $mdf = $repo->getMetadataFormat( 'oai_dc' ) or return; 544 543 545 my $mets; 544 546 for($repo->listMetadataFormats()) … … 551 553 } 552 554 553 my $mdf = $repo->getMetadataFormat( 'oai_dc' ) or return;554 555 555 my $ftt = $mdf->getFulltext(); 556 556 557 # Full text object requires a record to initialize, so this is delayed 558 my $ft; 559 560 $Celestial::FullText::MAX_FILE_SIZE = $MAX_FILE_SIZE; 561 $Celestial::FullText::DEBUG = $opt_verbose; 562 557 563 my $sth; 558 564 if( defined($identifier) ) { … … 568 574 $sth->execute or die $!; 569 575 } 570 my $st; 576 571 577 while( my ($id) = $sth->fetchrow_array ) 572 578 { 573 579 # Skip if the record already has some full-texts recorded 574 if( $opt_missing and $ftt->hasFulltext( $id ) ) 575 { 576 next; 577 } 580 next if $opt_missing and $ftt->hasFulltext( $id ); 578 581 582 # Remove existing formats for this record 579 583 $ftt->removeFulltext( $id ); 580 584 581 my $rec = $mets ? 582 getMetsRecord( $mets, $id ) : 583 getDcRecord( $mdf, $id ); 584 585 my $rec = getDcRecord( $mdf, $id ) or next; 586 my $mets_record = $mets ? getMetsRecord( $mets, $id ) : undef; 587 585 588 warn sprintf("No METS record for %s\n", $mdf->getRecordIdentifier( $id )) 586 if $mets and not $rec; 587 588 $rec ||= getDcRecord( $mdf, $id ); 589 590 next unless $rec; 589 if $mets and not $mets_record; 591 590 592 591 warn sprintf("Searching for full-text for %s\n", $rec->identifier) 593 592 if $opt_verbose; 594 593 595 $Celestial::FullText::MAX_FILE_SIZE = $MAX_FILE_SIZE; 596 $Celestial::FullText::DEBUG = $opt_verbose; 597 my $ft = Celestial::FullText->new( 598 $ha, 599 server_type => $st, 600 record => $rec 601 ); 602 unless( $st ) 594 # We need to initialize the fulltext object 595 if( !$ft ) 603 596 { 604 $st = $ft->server_type; 605 unless( $st ) { 606 warn "Can't harvest full-text from ".$repo->identifier.": repository type unsupported\n" if $opt_verbose; 597 $ft = Celestial::FullText->new( 598 $ha, 599 $rec 600 ); 601 602 unless( $ft ) 603 { 604 warn "Can't harvest full-text from ".$repo->identifier.": repository type unknown or unsupported\n" if $opt_verbose; 607 605 last; 608 606 } 609 607 } 610 608 611 foreach my $fmt ($ft->formats) 609 my @formats = $ft->formats( $mets_record || $rec ); 610 611 foreach my $fmt ( @formats ) 612 612 { 613 613 my $ds = $rec->datestamp; … … 645 645 } 646 646 $sth->finish; 647 return $st; 647 648 return $ft; 648 649 } 649 650 trunk/celestial-2/lib/Celestial/DBI.pm
r444 r446 73 73 use Carp; 74 74 75 use vars qw($AUTOLOAD $errstr $DB_MAX_ERROR_SIZE $DATE_FORMAT); 76 77 $DB_MAX_ERROR_SIZE = 2**15; # 32k 78 79 $DATE_FORMAT = '%Y%m%d%H%i%S'; 75 use vars qw($AUTOLOAD $errstr ); 76 77 our $DB_MAX_ERROR_SIZE = 2**15; # 32k 78 our $DB_MAX_FIELD_SIZE = 1 * 1024 * 1024; # 1MB of XML ... 79 80 our $DATE_FORMAT = '%Y%m%d%H%i%S'; 80 81 81 82 =pod … … 831 832 my( $self, $mdf, $rec ) = @_; 832 833 $self->addProvenance($mdf, $rec); 833 $self->updateRecord($mdf, $rec);834 return $self->updateRecord($mdf, $rec); 834 835 } 835 836 … … 884 885 } 885 886 887 if( 888 length($hd) > $DB_MAX_FIELD_SIZE or 889 length($md) > $DB_MAX_FIELD_SIZE or 890 length($ab) > $DB_MAX_FIELD_SIZE 891 ) 892 { 893 warn $repo->id . " " . $rec->identifier . " is larger than max allowed size ($DB_MAX_FIELD_SIZE)\n"; 894 return undef; 895 } 896 886 897 my( $id, $accession ); 887 898 eval { trunk/celestial-2/lib/Celestial/FullText.pm
r444 r446 1 =head1 NAME 2 3 Celestial::FullText - Retrieve the full-text for an OAI record 4 5 =head1 SYNOPSIS 6 7 use Celestial::FullText; 8 9 my $ft = Celestial::FullText->new( $ha, identifier => $id ); 10 my $ft = Celestial::FullText->new( $ha, record => $rec ); 11 12 my $ft = Celestial::FullText->new( 13 $ha, 14 record => $rec 15 ); 16 17 unless( $ft ) 18 { 19 warn "Unsupported repository type"; 20 } 21 22 foreach my $file ($ft->formats) 23 { 24 my $mt = $file->mime_type; 25 my @ext = $mt->extensions; 26 my $url = $file->url; 27 print -s $file; 28 } 29 30 =head1 DESCRIPTION 31 32 =head2 EXPORT 33 34 None by default. 35 36 =head1 METHODS 37 38 =over 4 39 40 =cut 41 1 42 package Celestial::FullText; 2 43 … … 23 64 # Preloaded methods go here. 24 65 66 =item $ft = Celestial::FullText->new HARVEST_AGENT, RECORD 67 68 Create a new FullText retrieval object using RECORD to determine what the type of repository is. 69 70 =cut 71 25 72 sub new 26 73 { 27 my( $class, $ha, %opts ) = @_; 28 unless( $opts{ identifier } || $opts{ record } ) 29 { 30 croak("Requires identifier or record arguments"); 31 } 32 unless( $opts{ record } ) 33 { 34 my $r = $ha->GetRecord( 35 identifier => $opts{ identifier }, 36 metadataPrefix => 'oai_dc', 37 handlers => { metadata => 'HTTP::OAI::Metadata::OAI_DC' } 38 ); 39 $opts{ record } = $r->next; 40 unless( $r->is_success ) 41 { 42 warn "Error getting record ($opts{identifier}): " . $r->message; 43 return; 44 } 45 } 46 unless( $opts{ record } ) 47 { 48 warn("Failed to get oai_dc record for $opts{identifier}"); 49 return; 50 } 51 unless( $opts{ record }->metadata ) 52 { 53 warn("Record doesn't contain metadata (".$opts{record}->identifier."): $opts{record}"); 54 return; 55 } 56 bless {%opts, ha => $ha}, $class; 57 } 58 59 sub formats 60 { 61 my $self = shift; 62 my $st = $self->server_type or return (); 63 64 my $f = "_$st"; 65 no strict "refs"; 66 return $self->$f(); 67 } 68 69 sub server_type 70 { 71 my $self = shift; 72 return $self->{ server_type } if $self->{ server_type }; 73 my $ha = $self->{ ha }; 74 my $rec = $self->{ record }; 75 76 if( $rec->metadata->isa( 'HTTP::OAI::Metadata::METS' )) 77 { 78 return $self->{ server_type } = "mets"; 79 } 74 my( $class, $ha, $rec ) = @_; 75 76 Carp::confess("Required record argument undefined") 77 unless( $rec ); 78 Carp::confess("Record doesn't contain metadata") 79 unless( $rec->metadata ); 80 81 my %self; 82 $self{ server_type } = guess_repository_type( $ha, $rec ) 83 or return undef; 84 85 return bless {%self, ha => $ha}, $class; 86 } 87 88 =item $st = Celestial::FullText::guess_repository_type HARVEST_AGENT, RECORD 89 90 Guess the repository type using HARVEST_AGENT and RECORD. 91 92 =cut 93 94 sub guess_repository_type 95 { 96 my( $ha, $rec ) = @_; 97 98 Carp::confess( "Requires a record containing HTTP::OAI::Metadata::OAI_DC metadata" ) 99 unless( $rec->metadata->isa( 'HTTP::OAI::Metadata::OAI_DC' ) ); 80 100 81 101 my $ids = $rec->metadata->dc->{ identifier }; … … 85 105 if( $uri->path eq '/perl/oai2' ) 86 106 { 87 return $self->{ server_type } ="eprints";107 return "eprints"; 88 108 } 89 109 } … … 112 132 if( $ct =~ /\"metadataFieldLabel\"/ and $ct =~ /\"metadataFieldValue\"/ ) 113 133 { 114 # $self->{ jumpoff } = $url; 115 return $self->{ server_type } = "dspace"; 134 return "dspace"; 116 135 } 117 136 } … … 120 139 } 121 140 141 =item @formats = $ft->formats RECORD 142 143 Return a list of formats for the given record. 144 145 =cut 146 147 sub formats 148 { 149 my( $self, $rec ) = @_; 150 151 my $st = $self->{ server_type }; 152 153 # If we've been handed a METS record, use it 154 if( $rec->metadata->isa( 'HTTP::OAI::Metadata::METS' )) 155 { 156 $st = "mets"; 157 } 158 159 my $f = "_$st"; 160 no strict "refs"; 161 return $self->$f($rec); 162 } 163 122 164 sub _dspace 123 165 { 124 my $self = shift;166 my( $self, $rec ) = @_; 125 167 my $ha = $self->{ ha }; 126 my $rec = $self->{ record };127 168 my @fmts; 128 169 my( $jo_url ) = grep { /^https?:\/\// } @{$rec->metadata->dc->{ identifier }}; … … 162 203 sub _eprints 163 204 { 164 my $self = shift;205 my( $self, $rec ) = @_; 165 206 my $ha = $self->{ ha }; 166 my $rec = $self->{ record };167 207 my @fmts; 168 208 my @urls; … … 186 226 sub _mets 187 227 { 188 my $self = shift;228 my( $self, $rec ) = @_; 189 229 my $ha = $self->{ ha }; 190 my $rec = $self->{ record };191 230 192 231 my $bu = URI->new($ha->baseURL)->canonical; … … 320 359 321 360 1; 361 322 362 __END__ 323 # Below is stub documentation for your module. You'd better edit it! 324 325 =head1 NAME 326 327 Celestial::FullText - Retrieve the full-text for an OAI record 328 329 =head1 SYNOPSIS 330 331 use Celestial::FullText; 332 333 my $ft = Celestial::FullText->new( $ha, identifier => $id ); 334 my $ft = Celestial::FullText->new( $ha, record => $rec ); 335 336 my $ft = Celestial::FullText->new( 337 $ha, 338 server_type => 'dspace', 339 record => $rec 340 ); 341 342 unless( $ft ) 343 { 344 warn "Unsupported repository type"; 345 } 346 347 print $ft->server_type; 348 349 foreach my $file ($ft->formats) 350 { 351 my $mt = $file->mime_type; 352 my @ext = $mt->extensions; 353 my $url = $file->url; 354 print -s $file; 355 } 356 357 =head1 DESCRIPTION 358 359 Stub documentation for Celestial::FullText, created by h2xs. It looks like the 360 author of the extension was negligent enough to leave the stub 361 unedited. 362 363 Blah blah blah. 364 365 =head2 EXPORT 366 367 None by default. 368 369 363 364 =back 370 365 371 366 =head1 SEE ALSO 372 373 Mention other useful documentation such as the documentation of374 related modules or operating system documentation (such as man pages375 in UNIX), or any relevant external documentation such as RFCs or376 standards.377 378 If you have a mailing list set up for your module, mention it here.379 380 If you have a web site set up for your module, mention it here.381 367 382 368 =head1 AUTHOR … … 392 378 at your option, any later version of Perl 5 you may have available. 393 379 394 395 =cut 380 =cut