version 1.21, 2002/10/08 15:09:36
|
version 1.33, 2003/06/19 19:34:27
|
Line 26
|
Line 26
|
# |
# |
# http://www.lon-capa.org/ |
# http://www.lon-capa.org/ |
# |
# |
# YEAR=2001 |
|
# 04/14/2001, 04/16/2001 Scott Harrison |
|
# |
|
# YEAR=2002 |
|
# 05/11/2002 Scott Harrison |
|
# |
|
### |
### |
|
|
# This script goes through a LON-CAPA resource |
=pod |
# directory and gathers metadata. |
|
# The metadata is entered into a SQL database. |
=head1 NAME |
|
|
|
B<searchcat.pl> - put authoritative filesystem data into sql database. |
|
|
|
=head1 SYNOPSIS |
|
|
|
Ordinarily this script is to be called from a loncapa cron job |
|
(CVS source location: F<loncapa/loncom/cron/loncapa>; typical |
|
filesystem installation location: F</etc/cron.d/loncapa>). |
|
|
|
Here is the cron job entry. |
|
|
|
C<# Repopulate and refresh the metadata database used for the search catalog.> |
|
C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> |
|
|
|
This script only allows itself to be run as the user C<www>. |
|
|
|
=head1 DESCRIPTION |
|
|
|
This script goes through a loncapa resource directory and gathers metadata. |
|
The metadata is entered into a SQL database. |
|
|
|
This script also performs general database maintenance, such as rebuilding |
|
the C<loncapa:metadata> table when its format is out of date. |
|
|
|
This script evaluates dynamic metadata from each author's |
|
F<nohist_resevaldata.db> database file in order to store it in MySQL, and |
|
compacts that file by adding up all "count"-type metadata into one entry. |
|
|
|
This script is playing an increasingly important role for a loncapa |
|
library server. The proper operation of this script is critical for a smooth |
|
and correct user experience. |
|
|
|
=cut |
|
|
use lib '/home/httpd/lib/perl/'; |
use lib '/home/httpd/lib/perl/'; |
use LONCAPA::Configuration; |
use LONCAPA::Configuration; |
Line 45 use IO::File;
|
Line 72 use IO::File;
|
use HTML::TokeParser; |
use HTML::TokeParser; |
use DBI; |
use DBI; |
use GDBM_File; |
use GDBM_File; |
|
use POSIX qw(strftime mktime); |
|
|
my @metalist; |
my @metalist; |
|
|
Line 57 sub unescape {
|
Line 85 sub unescape {
|
return $str; |
return $str; |
} |
} |
|
|
|
# -------------------------------------------------------- Escape Special Chars |
|
|
|
# Percent-encode a string: every non-word character (anything outside
# [A-Za-z0-9_]) is replaced by '%' followed by its two-digit lowercase hex
# code.
#
# Argument: the string to encode.
# Returns:  the encoded copy; undef is handed back unchanged.
sub escape {
    my $str = shift;
    return $str unless defined $str;
    $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
    return $str;
}
|
|
|
|
# ------------------------------------------- Code to evaluate dynamic metadata |
# ------------------------------------------- Code to evaluate dynamic metadata |
|
|
# Aggregate the dynamic (user-evaluation) metadata stored for one resource.
#
# Argument: the resource URL/path; it is passed through declutter() and a
#           trailing ".meta" is removed.
# Returns:  a hash (as a flat list) mapping each metadata type found for the
#           resource to its aggregate:
#             'avg' types - mean of the entries, rounded to two decimals
#             'cnt' types - number of entries
#             others      - accumulated value (numeric sum, or the
#                           '<hr>'-joined text for 'app' types)
#           The hash is empty when either database file cannot be tied.
# Side effect: every matching entry except the 'count' ones is copied into the
#           author's nohist_new_resevaldata.db; the 'count' entries are
#           replaced there by a single entry holding their sum.
sub dynamicmeta {
    my $url = declutter(shift);
    $url =~ s/\.meta$//;
    my %returnhash = ();
    my ($adomain, $aauthor) = ($url =~ /^(\w+)\/(\w+)\//);
    my $prodir = propath($adomain, $aauthor);

    # Lexical tie targets: nothing outside this sub can be left holding a
    # tied (open) database.
    my (%evaldata, %newevaldata);
    unless (tie(%evaldata, 'GDBM_File',
                $prodir.'/nohist_resevaldata.db', &GDBM_READER(), 0640)) {
        return %returnhash;
    }
    unless (tie(%newevaldata, 'GDBM_File',
                $prodir.'/nohist_new_resevaldata.db', &GDBM_WRCREAT(), 0640)) {
        # Do not leave the source database open when the target is unusable.
        untie(%evaldata);
        return %returnhash;
    }

    my %sum = ();
    my %cnt = ();
    # How the entries of each metadata type are combined.
    my %listitems = ('count'      => 'add',
                     'course'     => 'add',
                     'avetries'   => 'avg',
                     'stdno'      => 'add',
                     'difficulty' => 'avg',
                     'clear'      => 'avg',
                     'technical'  => 'avg',
                     'helpful'    => 'avg',
                     'correct'    => 'avg',
                     'depth'      => 'avg',
                     'comments'   => 'app',
                     'usage'      => 'cnt'
                     );

    # Keys of interest end in ___<url>___<type>.
    my $regexp = '___'.quotemeta($url).'___([a-z]+)$';

    foreach my $rawkey (keys %evaldata) {
        my $key = unescape($rawkey);
        next unless $key =~ /$regexp/;
        my $ctype = $1;
        my $value = $evaldata{$rawkey};

        $cnt{$ctype}++;

        my $mode = $listitems{$ctype};
        if (defined($mode) && $mode eq 'app') {
            # Text entries are appended, separated by a horizontal rule;
            # empty follow-up entries are skipped.
            if (defined($sum{$ctype})) {
                $sum{$ctype} .= '<hr>'.$value if $value;
            }
            else {
                $sum{$ctype} = ''.$value;
            }
        }
        else {
            if (defined($sum{$ctype})) {
                $sum{$ctype} += $value;
            }
            else {
                $sum{$ctype} = $value;
            }
        }

        # 'count' entries are compacted into one record below; everything
        # else is carried over unchanged.
        if ($ctype ne 'count') {
            $newevaldata{$rawkey} = $value;
        }
    }

    foreach my $ctype (keys %cnt) {
        my $mode = defined($listitems{$ctype}) ? $listitems{$ctype} : '';
        if ($mode eq 'avg') {
            $returnhash{$ctype} =
                int(($sum{$ctype}/$cnt{$ctype})*100.0+0.5)/100.0;
        }
        elsif ($mode eq 'cnt') {
            $returnhash{$ctype} = $cnt{$ctype};
        }
        else {
            $returnhash{$ctype} = $sum{$ctype};
        }
    }

    if ($returnhash{'count'}) {
        my $newkey = $$.'_'.time.'_searchcat___'.escape($url).'___count';
        $newevaldata{$newkey} = $returnhash{'count'};
    }

    untie(%evaldata);
    untie(%newevaldata);
    return %returnhash;
}
|
|
# ----------------- Code to enable 'find' subroutine listing of the .meta files |
# ----------------- Code to enable 'find' subroutine listing of the .meta files |
require "find.pl"; |
require "find.pl"; |
# Callback for find(): appends "$dir/$_" to @metalist for every entry that is
# a plain file (per lstat), whose name ends in ".meta", and whose name does
# not have the form <name>.<digits>.<ext>.meta.
# NOTE(review): $_ and $dir are presumably set by find.pl before each call,
# and the excluded name form looks like versioned copies - confirm.
sub wanted {
    # The lstat result is kept in the same package globals as before; the
    # call also primes the "_" stat cache used by -f on the next line.
    (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
    -f _ &&
    /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
    push(@metalist,"$dir/$_");
}
|
|
# --------------- Read loncapa_apache.conf and loncapa.conf and get variables |
# --------------- Read loncapa_apache.conf and loncapa.conf and get variables |
Line 148 delete $perlvar{'lonReceipt'}; # remove
|
Line 191 delete $perlvar{'lonReceipt'}; # remove
|
# ------------------------------------- Only run if machine is a library server |
# ------------------------------------- Only run if machine is a library server |
exit unless $perlvar{'lonRole'} eq 'library'; |
exit unless $perlvar{'lonRole'} eq 'library'; |
|
|
|
# ----------------------------- Make sure this process is running from user=www |
|
|
|
my $wwwid=getpwnam('www'); |
|
if ($wwwid!=$<) { |
|
$emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; |
|
$subj="LON: $perlvar{'lonHostID'} User ID mismatch"; |
|
system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\ |
|
mailto $emailto -s '$subj' > /dev/null"); |
|
exit 1; |
|
} |
|
|
|
|
|
# ---------------------------------------------------------- We are in business |
|
|
|
open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); |
|
print LOG '==== Searchcat Run '.localtime()."====\n\n"; |
my $dbh; |
my $dbh; |
# ------------------------------------- Make sure that database can be accessed |
# ------------------------------------- Make sure that database can be accessed |
{ |
{ |
unless ( |
unless ( |
$dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) |
$dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) |
) { |
) { |
print "Cannot connect to database!\n"; |
print LOG "Cannot connect to database!\n"; |
exit; |
exit; |
} |
} |
my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (". |
my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (". |
Line 174 my $dbh;
|
Line 233 my $dbh;
|
|
|
# ------------------------------------------------------------- get .meta files
# List the entries of the default domain's resource directory for which
# ishome() returns true; each one is treated as a user to process.
my $domain_resdir = "$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}";
my @homeusers = ();
if (opendir(RESOURCES, $domain_resdir)) {
    @homeusers = grep { ishome("$domain_resdir/$_") }
                 grep { !/^\.\.?$/ } readdir(RESOURCES);
    closedir(RESOURCES);
}
else {
    print LOG "Cannot read resource directory $domain_resdir: $!\n";
}

# Statements use placeholders so that quotes or other special characters in
# URLs and metadata values can neither break nor alter the SQL.
# NOTE(review): "like" still treats % and _ in the URL as wildcards, exactly
# as the original string-built query did - confirm whether "=" is intended.
my $select_sth = $dbh->prepare(
    'select * from metadata where url like binary ?');
my $delete_sth = $dbh->prepare(
    'delete from metadata where url like binary ?');
my $insert_sth = $dbh->prepare(
    'insert into metadata values ('.join(',', ('?') x 14).')');

foreach my $user (@homeusers) {
    print LOG "\n=== User: ".$user."\n\n";

    # Remove left-over db-files from potentially crashed searchcat run
    my $prodir = propath($perlvar{'lonDefDomain'}, $user);
    unlink($prodir.'/nohist_new_resevaldata.db');

    # Use find.pl
    @metalist = ();
    find("$domain_resdir/$user");

    # -- process each file to get metadata and put into search catalog SQL
    # database.  Also, check to see if already there.
    # I could just delete (without searching first), but this works for now.
    foreach my $m (@metalist) {
        print LOG "- ".$m."\n";
        my $ref = metadata($m);
        my $m2 = '/res/'.declutter($m);
        $m2 =~ s/\.meta$//;
        dynamicmeta($m2);

        $select_sth->execute($m2);
        my $r1 = $select_sth->fetchall_arrayref;
        if ($r1 && @$r1) {
            $delete_sth->execute($m2);
        }

        # Each field is removed from the hash returned by metadata() as it
        # is consumed (as before).  Missing values are stored as empty
        # strings, which is what the string-built statement produced.
        my @row = (
            delete($ref->{'title'}),
            delete($ref->{'author'}),
            delete($ref->{'subject'}),
            $m2,
            delete($ref->{'keywords'}),
            'current',
            delete($ref->{'notes'}),
            delete($ref->{'abstract'}),
            delete($ref->{'mime'}),
            delete($ref->{'language'}),
            sqltime(delete($ref->{'creationdate'})),
            sqltime(delete($ref->{'lastrevisiondate'})),
            delete($ref->{'owner'}),
            delete($ref->{'copyright'}),
        );
        $insert_sth->execute(map { defined($_) ? $_ : '' } @row);
    }

    # --------------------------------------------------- Clean up database
    # Need to, perhaps, remove stale SQL database records.
    # ... not yet implemented

    # ------------------------------------------- Copy over the new db-files
    # rename() replaces the old file in one step and avoids handing the
    # paths to a shell.
    my $newdb = $prodir.'/nohist_new_resevaldata.db';
    if (-e $newdb) {
        rename($newdb, $prodir.'/nohist_resevaldata.db')
            or print LOG "Cannot move $newdb into place: $!\n";
    }
}

# --------------------------------------------------- Close database connection
$dbh->disconnect;
print LOG "\n==== Searchcat completed ".localtime()." ====\n";
close(LOG);
exit 0;
|
|
|
|
|
|
|
# ============================================================================= |
|
|
# ---------------------------------------------------------------- Get metadata |
# ---------------------------------------------------------------- Get metadata |
# significantly altered from subroutine present in lonnet |
# significantly altered from subroutine present in lonnet |
Line 238 sub metadata {
|
Line 314 sub metadata {
|
my $parser=HTML::TokeParser->new(\$metastring); |
my $parser=HTML::TokeParser->new(\$metastring); |
my $token; |
my $token; |
while ($token=$parser->get_token) { |
while ($token=$parser->get_token) { |
if ($token->[0] eq 'S') { |
if ($token->[0] eq 'S') { |
my $entry=$token->[1]; |
my $entry=$token->[1]; |
my $unikey=$entry; |
my $unikey=$entry; |
if (defined($token->[2]->{'part'})) { |
if (defined($token->[2]->{'part'})) { |
$unikey.='_'.$token->[2]->{'part'}; |
$unikey.='_'.$token->[2]->{'part'}; |
} |
} |
if (defined($token->[2]->{'name'})) { |
if (defined($token->[2]->{'name'})) { |
$unikey.='_'.$token->[2]->{'name'}; |
$unikey.='_'.$token->[2]->{'name'}; |
} |
} |
if ($metacache{$uri.'keys'}) { |
if ($metacache{$uri.'keys'}) { |
$metacache{$uri.'keys'}.=','.$unikey; |
$metacache{$uri.'keys'}.=','.$unikey; |
} else { |
} else { |
$metacache{$uri.'keys'}=$unikey; |
$metacache{$uri.'keys'}=$unikey; |
} |
} |
map { |
map { |
$metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; |
$metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; |
} @{$token->[3]}; |
} @{$token->[3]}; |
unless ( |
unless ( |
$metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) |
$metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) |
) { $metacache{$uri.''.$unikey}= |
) { $metacache{$uri.''.$unikey}= |
$metacache{$uri.''.$unikey.'.default'}; |
$metacache{$uri.''.$unikey.'.default'}; |
} |
} |
} |
} |
} |
} |
} |
} |
return \%metacache; |
return \%metacache; |
} |
} |
Line 269 sub metadata {
|
Line 345 sub metadata {
|
# ------------------------------------------------------------ Serves up a file |
# ------------------------------------------------------------ Serves up a file |
# returns either the contents of the file or a -1 |
# returns either the contents of the file or a -1 |
# Read a whole file into memory.
#
# Argument: path of the file to read.
# Returns:  the file's contents as one string ('' for an empty file), or -1
#           when the file does not exist or cannot be opened for reading.
sub getfile {
    my $file = shift;
    return -1 unless -e $file;

    # An explicit read mode keeps special characters in the name from being
    # interpreted as an open mode or a pipe.
    my $fh = IO::File->new($file, 'r');
    return -1 unless defined $fh;

    # Slurp in one read; localising $/ (and not looping over $_) leaves the
    # caller's $_ untouched.
    local $/;
    my $contents = <$fh>;
    $fh->close;
    return defined($contents) ? $contents : '';
}
|
|
# ------------------------------------------------------------- Declutters URLs |
# ------------------------------------------------------------- Declutters URLs |
Line 315 sub propath {
|
Line 391 sub propath {
|
# ---------------------------- convert 'time' format into a datetime sql format |
# ---------------------------- convert 'time' format into a datetime sql format |
# Format a time value as "YEAR-MONTH-DAY HOUR:MIN:SEC" in local time.
#
# Argument: a time value; it is first normalised through unsqltime(), so a
#           string already in the above format is accepted as well as epoch
#           seconds.
# Returns:  the formatted string.  Fields are not zero-padded.
sub sqltime {
    my ($sec, $min, $hour, $mday, $mon, $year) =
        localtime(unsqltime($_[0]));
    $mon++;            # localtime() months are 0-based
    $year += 1900;     # localtime() years are offsets from 1900
    return "$year-$mon-$mday $hour:$min:$sec";
}
|
|
|
# Convert broken-down local time into epoch seconds.
#
# Arguments (named): 'year' (four digits), 'month' (1-12), 'day', 'hours',
#           'minutes', 'seconds', and optionally 'dlsav', the daylight-saving
#           flag passed to mktime.
# Returns:  whatever POSIX::mktime returns for those fields.
sub maketime {
    my %th = @_;
    # Without an explicit flag, -1 lets mktime decide whether daylight
    # saving applies; an undefined value would be taken as 0 ("standard
    # time") and shift summer timestamps by an hour.
    my $isdst = defined($th{'dlsav'}) ? $th{'dlsav'} : -1;
    return POSIX::mktime($th{'seconds'}, $th{'minutes'}, $th{'hours'},
                         $th{'day'}, $th{'month'}-1,
                         $th{'year'}-1900, 0, 0, $isdst);
}
|
|
|
|
|
######################################### |
|
# |
|
# Retro-fixing of un-backward-compatible time format |
|
|
|
# Retro-fix for the un-backward-compatible time format: turn an SQL-style
# "YEAR-MONTH-DAY HOUR:MIN:SEC" string back into the value maketime()
# computes for those fields.
#
# Argument: a timestamp in either form.
# Returns:  the maketime() result when the argument matches the SQL format;
#           otherwise the argument unchanged (including undef).
sub unsqltime {
    my $timestamp = shift;
    return $timestamp unless defined $timestamp;
    if (my ($year, $month, $day, $hours, $minutes, $seconds) =
            $timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) {
        $timestamp = maketime('year'=>$year, 'month'=>$month, 'day'=>$day,
                              'hours'=>$hours, 'minutes'=>$minutes,
                              'seconds'=>$seconds);
    }
    return $timestamp;
}
|
|