version 1.3, 2003/07/29 14:13:36
|
version 1.9, 2006/05/10 16:28:56
|
Line 12 use strict;
|
Line 12 use strict;
|
use LWP::UserAgent; |
use LWP::UserAgent; |
use Getopt::Std; |
use Getopt::Std; |
use Digest::MD5 qw(md5_hex); |
use Digest::MD5 qw(md5_hex); |
|
use IO::File; |
|
|
|
my $basepath='/home/httpd/cgi-bin/OAI-XMLFile/XMLFile/nsdlexport/data'; |
|
|
my $pub_month; |
my $pub_month; |
my $pub_year; |
my $pub_year; |
Line 25 my $content_regex = 'File Not Found';
|
Line 28 my $content_regex = 'File Not Found';
|
# Configuration |
# Configuration |
|
|
my $debug = 0; |
my $debug = 0; |
my $url = 'http://s10.lite.msu.edu/cgi-bin/metadata_harvest.pl'; |
|
|
# Stats |
|
my %allstats=(); |
|
my %filterstats=(); |
|
my %knockout=(); |
|
my %knockoutlang=(); |
|
|
# The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab |
# The list of servers is from the LON-CAPA CVS repository in /loncapa/loncom/production_hosts.tab |
my @servers = ( 'newscience.westshore.cc.mi.us', 's10.lite.msu.edu', 's12.lite.msu.edu', 'lon-capa.chem.sunysb.edu', 'schubert.tmcc.edu', 'dalton.chem.sfu.ca', 'capa2.phy.ohiou.edu', 'pollux.physics.fsu.edu', 'loncapa.physics.sc.edu', 'loncapa.math.ucf.edu', 'zappa.ags.udel.edu', 'loncapa.gwu.edu'); |
my @servers = ( |
|
'newscience.westshore.edu', |
|
's10.lite.msu.edu', |
|
's12.lite.msu.edu', |
|
'schubert.tmcc.edu', |
|
'dalton.chem.sfu.ca', |
|
'capa2.phy.ohiou.edu', |
|
'pollux.physics.fsu.edu', |
|
'loncapa3.physics.sc.edu', |
|
'zappa.ags.udel.edu', |
|
'loncapa.gwu.edu', |
|
'neptune.physics.ndsu.nodak.edu', |
|
'capa1.uwsp.edu', |
|
'loncapa.Mines.EDU', |
|
'loncapa.chm.nau.edu', |
|
'library1.lon-capa.uiuc.edu', |
|
'lon-capa.bsu.edu', |
|
'psblnx03.bd.psu.edu', |
|
'lon-capa.acadiau.ca', |
|
'harvard.lon-capa.org', |
|
'capa1.cc.huji.ac.il', |
|
'lon-capa.phy.cmich.edu', |
|
'meitner.physics.hope.edu', |
|
'loncapa.vcu.edu', |
|
'lon-capa.ucsc.edu', |
|
'lon-capa.bsu.edu', |
|
'harvard.lon-capa.org' |
|
); |
|
|
|
foreach (@servers) { |
|
my $url='http://'.$_.'/cgi-bin/metadata_harvest.pl'; |
# End Configuration |
# End Configuration |
|
|
my $ua = new LWP::UserAgent; |
my $ua = new LWP::UserAgent; |
Line 40 $request->authorization_basic('reaper',
|
Line 78 $request->authorization_basic('reaper',
|
my $response = $ua->request( $request ); |
my $response = $ua->request( $request ); |
|
|
if ( $response->is_success ) { |
if ( $response->is_success ) { |
|
print 'SUCCESS: ' . $response->message.' for '.$url."\n\n"; |
$content = $response->content; |
$content = $response->content; |
# Delete all blank lines |
# Delete all blank lines |
$content =~ s/(?<!.)\n//g; |
$content =~ s/(?<!.)\n//g; |
Line 48 if ( $response->is_success ) {
|
Line 87 if ( $response->is_success ) {
|
# Push the content into an array |
# Push the content into an array |
@loncapa = split /\n/, $content; |
@loncapa = split /\n/, $content; |
} else { |
} else { |
die 'LON-CAPA request failed: ' . $response->message; |
print 'LON-CAPA request failed: ' . $response->message.' for '.$url."\n\n"; |
|
next; |
} |
} |
|
|
#@loncapa=undef; |
#@loncapa=undef; |
Line 60 if ( $response->is_success ) {
|
Line 100 if ( $response->is_success ) {
|
#} |
#} |
|
|
my %records = ();; |
my %records = ();; |
print '<?xml version="1.0" encoding="UTF-8"?>'."\n\n"; |
|
|
my %stats=(); |
|
|
foreach my $metadata (@loncapa) { |
foreach my $metadata (@loncapa) { |
chomp $metadata; |
chomp $metadata; |
$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs; |
$metadata=~s/[^\w\d\s\.\;\:\,\|\/]/ /gs; |
my @tkline = split('\|', $metadata); |
my @tkline = split('\|', $metadata); |
my $title = $tkline[0]; |
my ($rawtype)=($tkline[3]=~/\.(\w+)$/); |
next if ( $title eq '' ); |
$rawtype=~tr/A-Z/a-z/; |
|
$allstats{$rawtype}++; |
|
|
|
my $title = $tkline[0]; |
|
if ( $title eq '' ) { $knockout{'no_title_'.$rawtype}++; next; } |
my $author = $tkline[1]; |
my $author = $tkline[1]; |
next if ( $author eq '' ); |
if ( $author eq '' ) { $knockout{'no_author_'.$rawtype}++; next; } |
my @authorname = split(' ', $author); |
my @authorname = split(' ', $author); |
my $author_fname = $authorname[0]; |
my $author_fname = $authorname[0]; |
my $author_lname = $authorname[1]; |
my $author_lname = $authorname[1]; |
Line 84 foreach my $metadata (@loncapa) {
|
Line 129 foreach my $metadata (@loncapa) {
|
next if ( ($subject eq 'Sample') || ($subject eq 'Something') ); |
next if ( ($subject eq 'Sample') || ($subject eq 'Something') ); |
my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3]; |
my $resourceurl = 'http://nsdl.lon-capa.org' . $tkline[3]; |
my $baseid=$tkline[3]; |
my $baseid=$tkline[3]; |
|
my ($adom,$auname)=($baseid=~/^\/res\/(\w+)\/(\w+)\//); |
$baseid=~s/\W/\_/g; |
$baseid=~s/\W/\_/g; |
$baseid=~s/^\_res\_//g; |
$baseid=~s/^\_res\_//g; |
|
my $fileid=md5_hex($baseid); |
|
|
next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ ); |
next if ( $resourceurl =~ /(.*)\/demo\/(.*)/ ); |
|
# too many fragments out there |
|
next unless ($resourceurl=~/\.(html|htm|problem|assess|xhtm|xml|xhtml|gif|jpg|jpeg|png)$/i); |
|
|
my $keywords = $tkline[4]; |
my $keywords = $tkline[4]; |
my $version = $tkline[5]; |
my $version = $tkline[5]; |
my $notes = $tkline[6]; |
my $notes = $tkline[6]; |
my $abstract = $tkline[7]; |
my $abstract = $tkline[7]; |
next if ($abstract eq ''); |
$abstract=~s/ s / /gs; |
my $type = $tkline[8]; |
$abstract=~s/\s+/ /gs; |
|
my $postsubject=$subject; |
|
unless ($postsubject) { |
|
$postsubject=$keywords; |
|
} else { |
|
$postsubject.=' ('.$keywords.')'; |
|
} |
|
unless ($postsubject=~/\w/) { $knockout{'nosubject_'.$rawtype}++; next; } |
|
unless ($abstract) { $knockout{'noabstract_'.$rawtype}++; next; } |
|
my $type = $rawtype; |
|
if ($type=~/htm/) { $type='htm'; } |
|
|
my $learning_resource_type; |
my $learning_resource_type; |
if ( $type eq 'problem' ) { |
if ( $type eq 'problem' ) { |
$learning_resource_type = 114; |
$learning_resource_type = 114; |
Line 126 foreach my $metadata (@loncapa) {
|
Line 187 foreach my $metadata (@loncapa) {
|
$media_format = 0; |
$media_format = 0; |
} |
} |
|
|
my $language = $tkline[9]; # Look only for seniso |
my $language = $tkline[9]; |
next if ( $language ne 'seniso'); |
# likelihood is that the following is true (people would bother if it is not) |
|
if (($language=~/(seniso|notset|English)/) || (!$language)) { $language='seniso'; } |
|
# NSDL only does English |
|
if ( $language ne 'seniso') { $knockout{'lang_'.$rawtype}++; $knockoutlang{$language}++; next; } |
my $primary_language='en-US'; |
my $primary_language='en-US'; |
my $creation_date = $tkline[10]; |
my $creation_date = $tkline[10]; |
my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ ); |
my ($pub_year,$pub_month,$pub_day) = ( $creation_date =~ /^(\d{4}) (\d{2}) (\d{2})\s(\d{2}):(\d{2}):(\d{2})$/ ); |
Line 148 foreach my $metadata (@loncapa) {
|
Line 212 foreach my $metadata (@loncapa) {
|
# Domain means restricted to a particular LON-CAPA domain |
# Domain means restricted to a particular LON-CAPA domain |
# Defaults mean access open to any registered LON-CAPA user |
# Defaults mean access open to any registered LON-CAPA user |
# Private means open only to author of material |
# Private means open only to author of material |
next if ( $copyright eq 'private'); |
unless ($copyright eq 'public') { $knockout{'notpublic_'.$rawtype}++; next; } |
my $platform = "5"; # HTML Browser (not specified but construed from metadata) |
my $platform = "5"; # HTML Browser (not specified but construed from metadata) |
print (<<ENDMETA); |
# |
|
# We actually do this |
|
# |
|
$stats{$type}++; |
|
$filterstats{$type}++; |
|
# |
|
# Create path |
|
# |
|
unless (-e $basepath.'/'.$adom) { mkdir($basepath.'/'.$adom); } |
|
unless (-e $basepath.'/'.$adom.'/'.$auname) { |
|
mkdir($basepath.'/'.$adom.'/'.$auname) || die 'Could not create '.$basepath.'/'.$adom.'/'.$auname; |
|
} |
|
open(XML,'>'.$basepath.'/'.$adom.'/'.$auname.'/'.$baseid.'.xml'); |
|
print XML (<<ENDMETA); |
|
<?xml version="1.0" encoding="UTF-8"?> |
|
|
<oaidc:dc xmlns="http://purl.org/dc/elements/1.1/" |
<oaidc:dc xmlns="http://purl.org/dc/elements/1.1/" |
xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/" |
xmlns:oaidc="http://www.openarchives.org/OAI/2.0/oai_dc/" |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
Line 158 foreach my $metadata (@loncapa) {
|
Line 237 foreach my $metadata (@loncapa) {
|
http://www.openarchives.org/OAI/2.0/oai_dc.xsd" |
http://www.openarchives.org/OAI/2.0/oai_dc.xsd" |
> |
> |
<title>$title</title> |
<title>$title</title> |
<creator>$author_fname $author_lname</creator> |
<creator>$author</creator> |
<identifier>$resourceurl</identifier> |
<identifier>$resourceurl</identifier> |
<subject>$keywords</subject> |
<subject>$postsubject</subject> |
<subject>$subject</subject> |
|
<language>$primary_language</language> |
<language>$primary_language</language> |
<description>$abstract</description> |
<description>$abstract</description> |
<date>$rev_year-$rev_month-$rev_day</date> |
<date>$rev_year-$rev_month-$rev_day</date> |
</oaidc:dc> |
</oaidc:dc> |
|
|
ENDMETA |
ENDMETA |
|
close (XML); |
|
} |
|
foreach my $thistype (sort keys %stats) { |
|
print "\n$thistype: $stats{$thistype}"; |
|
} |
|
print "\n----\n"; |
|
} |
|
print "\nDone.\n"; |
|
foreach my $thistype (sort keys %allstats) { |
|
print "\n$thistype: $allstats{$thistype} ($filterstats{$thistype}) title: $knockout{'no_title_'.$thistype} author: $knockout{'no_author_'.$thistype} lang: $knockout{'lang_'.$thistype} priv: $knockout{'private_'.$thistype} domain: $knockout{'domain_'.$thistype} custom: $knockout{'custom_'.$thistype}"; |
|
} |
|
print "\n----\n"; |
|
foreach my $thislang (sort keys %knockoutlang) { |
|
print "\n>$thislang<: $knockoutlang{$thislang}"; |
} |
} |
|
print "\n"; |