version 1.30, 2003/02/03 17:01:55
|
version 1.32, 2003/03/26 20:15:57
|
Line 26
|
Line 26
|
# |
# |
# http://www.lon-capa.org/ |
# http://www.lon-capa.org/ |
# |
# |
# YEAR=2001 |
|
# 04/14/2001, 04/16/2001 Scott Harrison |
|
# |
|
# YEAR=2002 |
|
# 05/11/2002 Scott Harrison |
|
# |
|
# YEAR=2003 |
|
# Scott Harrison |
|
# |
|
### |
### |
|
|
=pod |
=pod |
|
|
=head1 NAME |
=head1 NAME |
Line 52 filesystem installation location: F</etc
|
Line 42 filesystem installation location: F</etc
|
Here is the cron job entry. |
Here is the cron job entry. |
|
|
C<# Repopulate and refresh the metadata database used for the search catalog.> |
C<# Repopulate and refresh the metadata database used for the search catalog.> |
|
|
C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> |
C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> |
|
|
This script only allows itself to be run as the user C<www>. |
This script only allows itself to be run as the user C<www>. |
Line 65 The metadata is entered into a SQL datab
|
Line 54 The metadata is entered into a SQL datab
|
This script also does general database maintenance such as reformatting |
This script also does general database maintenance such as reformatting |
the C<loncapa:metadata> table if it is deprecated. |
the C<loncapa:metadata> table if it is deprecated. |
|
|
This script evaluates dynamic metadata from the authors'
F<nohist_resevaldata.db> database file in order to store it in MySQL, as
well as to compress the filesize (add up all "count"-type metadata).
|
|
This script is playing an increasingly important role for a loncapa |
This script is playing an increasingly important role for a loncapa |
library server. The proper operation of this script is critical for a smooth |
library server. The proper operation of this script is critical for a smooth |
Line 74 and correct user experience.
|
Line 64 and correct user experience.
|
|
|
=cut |
=cut |
|
|
# ========================================================== Setting things up. |
|
|
|
# ------------------------------------------------------ Use external modules. |
|
|
|
use lib '/home/httpd/lib/perl/'; |
use lib '/home/httpd/lib/perl/'; |
use LONCAPA::Configuration; |
use LONCAPA::Configuration; |
|
|
Line 87 use DBI;
|
Line 73 use DBI;
|
use GDBM_File; |
use GDBM_File; |
use POSIX qw(strftime mktime); |
use POSIX qw(strftime mktime); |
|
|
# ----------------- Code to enable 'find' subroutine listing of the .meta files |
|
use File::Find; |
|
|
|
# List of .meta files (used on a per-user basis). |
|
my @metalist; |
my @metalist; |
|
|
# --------------- Read loncapa_apache.conf and loncapa.conf and get variables. |
|
my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf');
my %perlvar = %{$perlvarref};
undef($perlvarref); # Remove since sensitive and not needed.
delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed.

# ------------------------------------- Only run if machine is a library server
if ($perlvar{'lonRole'} ne 'library')
{
    exit(0);
}

# ------------------------------ Make sure this process is running as user=www.
my $wwwid = getpwnam('www');

# getpwnam() yields undef when the account does not exist.  undef compares
# numerically equal to 0, which would let root pass this check, so a missing
# account is treated as a mismatch as well.
if (!defined($wwwid) || $wwwid != $<)
{
    # Notify both the administrative and the system contact, then give up.
    my $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
    my $subj = "LON: $perlvar{'lonHostID'} User ID mismatch";
    system("echo 'User ID mismatch.  searchcat.pl must be run as user www.' | ".
           "mailto $emailto -s '$subj' > /dev/null");
    exit(1);
}
|
|
|
# ------------------------------------------------------ Initialize log output. |
|
open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
print(LOG join('','==== Searchcat Run ',scalar(localtime()),' ====',"\n\n"));

my $dbh; # Database object reference handle.

# ----------------------------- Verify connection to loncapa:metadata database.
$dbh = DBI->connect('DBI:mysql:loncapa','www',
                    $perlvar{'lonSqlAccess'},
                    { RaiseError => 0,PrintError => 0});
if (!$dbh)
{
    # Without a database there is nothing to do; record why and stop.
    print(LOG '**** ERROR **** Cannot connect to database!'."\n");
    exit(0);
}

# ------------------------------ Create loncapa:metadata table if non-existent.
# The statement is assembled from the column list: every searchable text
# column also receives a FULLTEXT index named idx_<column>.
my $make_metadata_table = do {
    my @leading_text_columns = qw(title author subject url keywords
                                  version notes abstract mime language);
    my @trailing_text_columns = qw(owner copyright);
    my @definitions = (
        (map { $_.' TEXT' } @leading_text_columns),
        'creationdate DATETIME',
        'lastrevisiondate DATETIME',
        (map { $_.' TEXT' } @trailing_text_columns),
        'utilitysemaphore BOOL',
        (map { 'FULLTEXT idx_'.$_.' ('.$_.')' }
             (@leading_text_columns,@trailing_text_columns)),
    );
    'CREATE TABLE IF NOT EXISTS metadata ('.
        join(', ',@definitions).
        ') TYPE=MYISAM';
};

$dbh->do($make_metadata_table); # Generate the table.
|
|
|
# ----------------------------- Verify format of the loncapa:metadata database. |
|
# (delete and recreate database if necessary). |
|
|
|
# Make a positive control for verifying table structure. |
|
# The control table is created from the very same statement, only renamed,
# so that its description shows what the real table ought to look like.
(my $make_metadata_table_CONTROL = $make_metadata_table) =~
    s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/;

$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata');
$dbh->do($make_metadata_table_CONTROL);

my $table_description; # selectall reference to the table description.

my $CONTROL_table_string; # What the table description should look like.
my $table_string; # What the table description does look like.

# Calculate the CONTROL table description (what it should be).
$table_description = $dbh->selectall_arrayref('describe CONTROL_metadata');
foreach my $control_row (@{$table_description})
{
    $CONTROL_table_string .= join(',',@{$control_row})."\n";
}

# Calculate the current table description (what it currently looks like).
$table_description = $dbh->selectall_arrayref('describe metadata');
foreach my $current_row (@{$table_description})
{
    $table_string .= join(',',@{$current_row})."\n";
}

# Any difference means the existing table has an outdated layout:
# log it, then drop and regenerate the table.
unless ($table_string eq $CONTROL_table_string)
{
    print(LOG '**** WARNING **** Table structure mismatch, need to regenerate.'.
          "\n");
    $dbh->do('DROP TABLE IF EXISTS metadata');
    $dbh->do($make_metadata_table);
}

$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay.  Done with control.
# ----------------------------------------------------- Un-Escape Special Chars

# ----------------------------------------------- Set utilitysemaphore to zero.
# Rows that are re-inserted during this run get utilitysemaphore = 1.
$dbh->do('UPDATE metadata SET utilitysemaphore = 0');
|
|
|
# ========================================================= Main functionality. |
|
|
|
# - Determine home authors on this server based on resources dir and user tree. |
|
|
|
# RESOURCES: the resources directory (subdirs correspond to author usernames). |
|
# Without the resources directory there are no authors to process.  The exit
# must not depend on whether the log line could be written, so it is a
# statement of its own rather than chained to print() with "and".
unless (opendir(RESOURCES,
                "$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}"))
{
    print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n");
    exit(0);
}

# query_home_server_status will look for user home directories on this machine.
# Only directory entries (other than . and ..) for which it returns true are
# kept as home users.
my @homeusers =
    grep {&query_home_server_status($perlvar{'lonDocRoot'}.'/res/'.
                                    $perlvar{'lonDefDomain'}.'/'.$_)
         } grep {!/^\.\.?$/} readdir(RESOURCES);
closedir(RESOURCES);

unless (@homeusers)
{
    print(LOG '=== No home users found on this server.'."\n");
}
|
|
|
# Consider each author individually. |
|
# Both statements are prepared once and executed with bound values, so that
# quote characters inside titles, abstracts, etc. can neither break nor alter
# the SQL.  The insert takes one value per column of the metadata table.
my $delete_record_sth =
    $dbh->prepare('delete from metadata where url like binary ?');
my $insert_record_sth =
    $dbh->prepare('insert into metadata values ('.
                  join(',',('?') x 15).')');

foreach my $user (@homeusers)
{
    # Make a log entry.
    print(LOG "\n".'=== User: '.$user."\n\n");

    # Get filesystem path to this user's directory.
    my $user_directory =
        &construct_path_to_user_directory($perlvar{'lonDefDomain'},$user);

    # Remove left-over db-files from a potentially crashed searchcat run.
    unlink($user_directory.'/nohist_new_resevaldata.db');

    # Cleanup the metalist array.
    @metalist = ();

    # This will add entries to the @metalist array.
    &File::Find::find(\&wanted,
                      $perlvar{'lonDocRoot'}.'/res/'.
                      $perlvar{'lonDefDomain'}.'/'.$user);

    # -- process file to get metadata and put into search catalog SQL database
    # Also, build and store dynamic metadata.
    # Also, delete record entries before refreshing.
    foreach my $m (@metalist)
    {
        # Log this action.
        print(LOG "- ".$m."\n");

        # Get metadata from the file.
        my $ref = get_metadata_from_file($m);

        # Make a datarecord identifier for this resource.
        my $m2 = '/res/'.declutter($m);
        $m2 =~ s/\.meta$//;

        # Build and store dynamic metadata inside nohist_resevaldata.db.
        build_on_the_fly_dynamic_metadata($m2);

        # Delete the record if it already exists (deleting nothing is
        # harmless, so no preliminary select is needed).
        $delete_record_sth->execute($m2);

        # Add new/replacement record into the loncapa:metadata table.
        # The order follows the column order of the table; the trailing 1
        # is the utilitysemaphore marking the row as refreshed in this run.
        my @record = (
            delete($ref->{'title'}),
            delete($ref->{'author'}),
            delete($ref->{'subject'}),
            $m2,
            delete($ref->{'keywords'}),
            'current',
            delete($ref->{'notes'}),
            delete($ref->{'abstract'}),
            delete($ref->{'mime'}),
            delete($ref->{'language'}),
            sql_formatted_time(delete($ref->{'creationdate'})),
            sql_formatted_time(delete($ref->{'lastrevisiondate'})),
            delete($ref->{'owner'}),
            delete($ref->{'copyright'}),
            1,
        );
        # Missing metadata is stored as an empty string (as the former
        # string-built statement did), not as NULL.
        $insert_record_sth->execute(map { defined($_) ? $_ : '' } @record);
    }

    # ---------------------------------------------- Copy over the new db-files
    # Both files live in the same directory, so rename() replaces the old
    # database without involving a shell.  There is nothing to move when no
    # new database was produced for this author.
    my $new_db_file = $user_directory.'/nohist_new_resevaldata.db';
    if (-e $new_db_file)
    {
        rename($new_db_file,$user_directory.'/nohist_resevaldata.db') or
            print(LOG '**** WARNING **** Cannot replace '.
                  'nohist_resevaldata.db for '.$user.': '.$!."\n");
    }
}

# ----------------------- Clean up database, remove stale SQL database records.
# This runs once, after every author has been refreshed: rows still at
# utilitysemaphore = 0 were not re-inserted by this run.  Doing it inside the
# loop would temporarily wipe the records of all authors not yet processed.
$dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0');

# --------------------------------------------------- Close database connection
$dbh->disconnect;
print LOG "\n==== Searchcat completed ".localtime()." ====\n";
close(LOG);
exit(0);
|
|
|
# ================================================================ Subroutines. |
|
|
|
=pod |
|
|
|
=head1 SUBROUTINES |
sub unescape { |
|
my $str=shift; |
=cut |
|
|
|
=pod |
|
|
|
B<unescape> - translate unstrange escaped syntax back to strange characters.
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$str> - string with unweird characters. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<string> - string with potentially weird characters. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
# Decode %XX escape sequences (two hexadecimal digits) into the characters
# they stand for.
#   $str    - string that may contain %XX sequences.
#   returns - the string after exactly one decoding pass.
sub unescape ($)
{
    my $str = shift(@_);

    # One pass only: decoding a second time would corrupt data that contains
    # an escaped percent sign ("%2541" must become "%41", not "A").
    $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg;
    return($str);
}
|
|
=pod |
|
|
|
B<escape> - translate strange characters to unstrange escaped syntax. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$str> - string with potentially weird characters to unweird-ify. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<string> - unweird-ified string. |
|
|
|
=back |
|
|
|
=cut |
# -------------------------------------------------------- Escape Special Chars |
|
|
# Replace every non-word character (anything matching \W) by a percent sign
# followed by its two-digit lowercase hexadecimal code.
#   $str    - string with potentially weird characters.
#   returns - the string after exactly one encoding pass.
sub escape ($)
{
    my $str = shift(@_);

    # One pass only: encoding twice would turn the "%" produced by the first
    # pass into "%25".
    $str =~ s/(\W)/"%".unpack('H2',$1)/eg;
    return($str);
}
|
|
=pod |
|
|
|
B<build_on_the_fly_dynamic_metadata> - evaluate and store dynamic metadata. |
|
|
|
Returns the dynamic metadata for an author, which will later be added to the |
|
MySQL database (not yet implemented). |
|
|
|
The vast majority of entries in F<nohist_resevaldata.db>, which contains |
|
the dynamic metadata for an author's resources, are "count", which make |
|
the file really large and evaluation really slow. |
|
|
|
While computing the current value of all dynamic metadata |
|
for later insertion into the MySQL metadata cache (not yet implemented), |
|
this routine also simply adds up all "count" type fields and replaces them by |
|
one new field with the to-date count. |
|
|
|
Only after successful completion of working with one author, copy new file to |
|
original file. Copy to tmp-"new"-db-file was necessary since db-file size |
|
would not shrink after "delete" of key. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$url> - the filesystem path (url may be a misnomer...) |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<hash> - key-value table of dynamically evaluated metadata. |
|
|
|
=back |
|
|
|
=cut |
# ------------------------------------------- Code to evaluate dynamic metadata |
|
|
# Evaluate the dynamic metadata stored for one resource in the author's
# nohist_resevaldata.db and copy that resource's entries to
# nohist_new_resevaldata.db.  All individual "count" entries are replaced
# there by a single entry holding their sum.
#   argument - path/url of the resource (a leading document root and a
#              trailing .meta are stripped).
#   returns  - hash of category => evaluated value; empty when either
#              database cannot be opened.
sub build_on_the_fly_dynamic_metadata {

    # Need to compute the user's directory.
    my $url=&declutter(shift);
    $url=~s/\.meta$//;
    my %returnhash=();
    my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);
    my $user_directory=&construct_path_to_user_directory($adomain,$aauthor);

    # Attempt a GDBM database instantiation inside users directory and proceed.
    my (%evaldata,%newevaldata);
    unless (tie(%evaldata,'GDBM_File',
                $user_directory.
                '/nohist_resevaldata.db',&GDBM_READER(),0640)) {
        return %returnhash;
    }
    unless (tie(%newevaldata,'GDBM_File',
                $user_directory.
                '/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640)) {
        untie(%evaldata); # Do not keep the original database open.
        return %returnhash;
    }

    # For different variables, track the running sum and counts.
    my %sum=();
    my %cnt=();

    # Define computed items as a sum (add) or an average (avg) or a raw
    # count (cnt) or append (app)?
    my %listitems=('count'      => 'add',
                   'course'     => 'add',
                   'avetries'   => 'avg',
                   'stdno'      => 'add',
                   'difficulty' => 'avg',
                   'clear'      => 'avg',
                   'technical'  => 'avg',
                   'helpful'    => 'avg',
                   'correct'    => 'avg',
                   'depth'      => 'avg',
                   'comments'   => 'app',
                   'usage'      => 'cnt'
                   );

    # Untaint the url and use as part of a regular expression.
    my $regexp=$url;
    $regexp=~s/(\W)/\\$1/g;
    $regexp='___'.$regexp.'___([a-z]+)$';

    # Check existing database for this author.
    # this is modifying the 'count' entries
    # and copying all other entries over
    foreach (keys %evaldata) {
        my $key=&unescape($_);
        if ($key=~/$regexp/) { # If url-based entry exists.
            my $ctype=$1; # Set to specific category type.

            # Do an increment for this category type.
            if (defined($cnt{$ctype})) {
                $cnt{$ctype}++;
            } else {
                $cnt{$ctype}=1;
            }
            unless ($listitems{$ctype} eq 'app') { # append comments
                # Increment the sum based on the evaluated data in the db.
                if (defined($sum{$ctype})) {
                    $sum{$ctype}+=$evaldata{$_};
                } else {
                    $sum{$ctype}=$evaldata{$_};
                }
            } else { # 'app' mode, means to use '<hr />' as a separator
                if (defined($sum{$ctype})) {
                    if ($evaldata{$_}) {
                        $sum{$ctype}.='<hr />'.$evaldata{$_};
                    }
                } else {
                    $sum{$ctype}=''.$evaldata{$_};
                }
            }
            if ($ctype ne 'count') {
                # this is copying all data except 'count' attributes
                $newevaldata{$_}=$evaldata{$_};
            }
        }
    }

    # these values will be returned (currently still unused)
    foreach (keys %cnt) {
        if ($listitems{$_} eq 'avg') {
            # Average, rounded to two decimal places.
            $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0;
        } elsif ($listitems{$_} eq 'cnt') {
            $returnhash{$_}=$cnt{$_};
        } else {
            $returnhash{$_}=$sum{$_};
        }
    }

    # generate new count key in resevaldata, insert sum
    if ($returnhash{'count'}) {
        my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count';
        $newevaldata{$newkey}=$returnhash{'count'};
    }

    untie(%evaldata); # Close/release the original nohist database.
    untie(%newevaldata); # Close/release the new nohist database.

    return %returnhash;
}
|
|
=pod |
# ----------------- Code to enable 'find' subroutine listing of the .meta files |
|
require "find.pl"; |
B<wanted> - used by B<File::Find::find> subroutine. |
sub wanted { |
|
|
This evaluates whether a file is wanted, and pushes it onto the |
|
I<@metalist> array. This subroutine was, for the most part, auto-generated |
|
by the B<find2perl> command. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$file> - a path to the file. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<boolean> - true or false based on logical statement. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
# File::Find callback: records the current entry in @metalist when it is a
# plain file whose name ends in .meta, unless the name also carries a version
# number (name.<digits>.<extension>.meta).
# $_ holds the name of the entry; $File::Find::dir holds its directory.
sub wanted ($)
{
    -f $_ &&
    /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
    push(@metalist,$File::Find::dir.'/'.$_);
}
|
|
=pod |
|
|
|
B<get_metadata_from_file> - read xml-tagged file and return parsed metadata. |
# --------------- Read loncapa_apache.conf and loncapa.conf and get variables |
|
my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); |
|
my %perlvar=%{$perlvarref}; |
|
undef $perlvarref; # remove since sensitive and not needed |
|
delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed |
|
|
I<Note that this is significantly altered from a subroutine present in lonnet.> |
# ------------------------------------- Only run if machine is a library server |
|
exit unless $perlvar{'lonRole'} eq 'library'; |
|
|
=over 4 |
# ----------------------------- Make sure this process is running from user=www |
|
|
Parameters: |
my $wwwid=getpwnam('www'); |
|
if ($wwwid!=$<) { |
|
$emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; |
|
$subj="LON: $perlvar{'lonHostID'} User ID mismatch"; |
|
system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\ |
|
mailto $emailto -s '$subj' > /dev/null"); |
|
exit 1; |
|
} |
|
|
=item I<$file> - a path to the file.
|
|
|
=back |
# ---------------------------------------------------------- We are in business |
|
|
=over 4 |
open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); |
|
print LOG '==== Searchcat Run '.localtime()."====\n\n"; |
|
my $dbh; |
|
# ------------------------------------- Make sure that database can be accessed |
|
{ |
|
unless ( |
|
$dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) |
|
) { |
|
print LOG "Cannot connect to database!\n"; |
|
exit; |
|
} |
|
my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (". |
|
"title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ". |
|
"version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ". |
|
"creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ". |
|
"copyright TEXT, FULLTEXT idx_title (title), ". |
|
"FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ". |
|
"FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ". |
|
"FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ". |
|
"FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ". |
|
"FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ". |
|
"FULLTEXT idx_copyright (copyright)) TYPE=MYISAM"; |
|
# It would sure be nice to have some logging mechanism. |
|
$dbh->do($make_metadata_table); |
|
} |
|
|
Returns: |
# ------------------------------------------------------------- get .meta files |
|
opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}"); |
|
my @homeusers=grep |
|
{&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")} |
|
grep {!/^\.\.?$/} readdir(RESOURCES); |
|
closedir RESOURCES; |
|
foreach my $user (@homeusers) { |
|
print LOG "\n=== User: ".$user."\n\n"; |
|
# Remove left-over db-files from potentially crashed searchcat run |
|
my $prodir=&propath($perlvar{'lonDefDomain'},$user); |
|
unlink($prodir.'/nohist_new_resevaldata.db'); |
|
# Use find.pl |
|
undef @metalist; |
|
@metalist=(); |
|
&find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user"); |
|
|
|
# -- process each file to get metadata and put into search catalog SQL database |
|
# Also, check to see if already there. |
|
# I could just delete (without searching first), but this works for now. |
|
foreach my $m (@metalist) { |
|
print LOG "- ".$m."\n"; |
|
my $ref=&metadata($m); |
|
my $m2='/res/'.&declutter($m); |
|
$m2=~s/\.meta$//; |
|
&dynamicmeta($m2); |
|
my $q2="select * from metadata where url like binary '$m2'"; |
|
my $sth = $dbh->prepare($q2); |
|
$sth->execute(); |
|
my $r1=$sth->fetchall_arrayref; |
|
if (@$r1) { |
|
$sth=$dbh->prepare("delete from metadata where url like binary '$m2'"); |
|
$sth->execute(); |
|
} |
|
$sth=$dbh->prepare('insert into metadata values ('. |
|
'"'.delete($ref->{'title'}).'"'.','. |
|
'"'.delete($ref->{'author'}).'"'.','. |
|
'"'.delete($ref->{'subject'}).'"'.','. |
|
'"'.$m2.'"'.','. |
|
'"'.delete($ref->{'keywords'}).'"'.','. |
|
'"'.'current'.'"'.','. |
|
'"'.delete($ref->{'notes'}).'"'.','. |
|
'"'.delete($ref->{'abstract'}).'"'.','. |
|
'"'.delete($ref->{'mime'}).'"'.','. |
|
'"'.delete($ref->{'language'}).'"'.','. |
|
'"'.sqltime(delete($ref->{'creationdate'})).'"'.','. |
|
'"'.sqltime(delete($ref->{'lastrevisiondate'})).'"'.','. |
|
'"'.delete($ref->{'owner'}).'"'.','. |
|
'"'.delete($ref->{'copyright'}).'"'.')'); |
|
$sth->execute(); |
|
} |
|
|
=item C<hash reference> - a hash array (keys and values). |
# ----------------------------------------------------------- Clean up database |
|
# Need to, perhaps, remove stale SQL database records. |
|
# ... not yet implemented |
|
|
=back |
|
|
|
=cut |
# -------------------------------------------------- Copy over the new db-files |
|
system('mv '.$prodir.'/nohist_new_resevaldata.db '. |
|
$prodir.'/nohist_resevaldata.db'); |
|
} |
|
# --------------------------------------------------- Close database connection |
|
$dbh->disconnect; |
|
print LOG "\n==== Searchcat completed ".localtime()." ====\n"; |
|
close(LOG); |
|
exit 0; |
|
# ============================================================================= |
|
|
# Read a .meta file below <lonDocRoot>/res/ and return its XML-tagged
# contents as a reference to a flat key-value table:
#   keys              - comma-separated list of all tag keys found
#   <unikey>          - text inside the tag (or its "default" attribute when
#                       the tag contains no text)
#   <unikey>.<attr>   - value of attribute <attr> of that tag
# where <unikey> is the tag name, extended by _<part> and _<name> when the
# tag carries those attributes.
# NOTE(review): relies on HTML::TokeParser being loaded elsewhere in the
# file -- not visible in this excerpt.
sub get_metadata_from_file ($)
{
    my ($filename) = @_;
    my %metatable; # Used to store return value of hash-tabled metadata.
    $filename = &declutter($filename); # Remove non-identifying filesystem info
    my $uri = ''; # The URI is not relevant in this scenario.
    unless ($filename =~ m/\.meta$/) # Unless ending with .meta.
    {
        $filename .= '.meta'; # Append a .meta suffix.
    }

    # Get the file contents.
    my $metadata_string =
        &get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename);

    # Parse the file based on its XML tags.
    my $parser = HTML::TokeParser->new(\$metadata_string);
    my $token;
    while ($token = $parser->get_token) # Loop through tokens.
    {
        if ($token->[0] eq 'S') # If it is a start token.
        {
            my $entry = $token->[1];
            my $unikey = $entry; # A unique identifier for this xml tag key.
            if (defined($token->[2]->{'part'}))
            {
                $unikey .= '_'.$token->[2]->{'part'};
            }
            if (defined($token->[2]->{'name'}))
            {
                $unikey .= '_'.$token->[2]->{'name'};
            }

            # Append $unikey to metatable's keys entry.
            if ($metatable{$uri.'keys'})
            {
                $metatable{$uri.'keys'} .= ','.$unikey;
            }
            else
            {
                $metatable{$uri.'keys'} = $unikey;
            }

            # Insert contents into metatable entry for the unikey.
            # The key must be built from the loop variable: $_ is not set
            # by this loop.
            foreach my $t3 (@{$token->[3]})
            {
                $metatable{$uri.''.$unikey.'.'.$t3} = $token->[2]->{$t3};
            }

            # If there was no text contained inside the tags, set = default.
            unless
                (
                 $metatable{$uri.''.$unikey} = $parser->get_text('/'.$entry)
                 )
            {
                $metatable{$uri.''.$unikey} =
                    $metatable{$uri.''.$unikey.'.default'};
            }
        }
    }

    # Return with a key-value table of XML tags and their tag contents.
    return(\%metatable);
}
|
|
|
=pod |
|
|
|
B<get_file_contents> - returns either the contents of the file or a -1. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$file> - a complete filesystem path to the file.
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<string> - file contents or a -1. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
# Return the complete contents of a file as one string.
#   $file   - complete filesystem path to the file.
#   returns - the file contents, or -1 when the file does not exist or
#             cannot be opened for reading.
sub get_file_contents ($)
{
    my $file = shift(@_);

    # If file does not exist, then return a -1 value.
    return(-1) unless (-e $file);

    # Three-argument open: the path is never interpreted as an open mode.
    # A file that exists but cannot be opened is reported like a missing one.
    open(my $file_handle,'<',$file) or return(-1);

    # Read in file contents (whole file at once).
    my $file_contents = do { local $/; <$file_handle> };
    close($file_handle);
    $file_contents = '' unless defined($file_contents);

    # Return file contents.
    return($file_contents);
}
|
|
|
=pod |
|
|
|
B<declutter> - Declutters URLs (remove extraneous prefixed filesystem path). |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$filesystem_path> - a complete filesystem path. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<string> - remnants of the filesystem path (beginning portion removed). |
|
|
|
=back |
|
|
|
=cut |
|
|
|
# Strip the leading document root, a leading slash and a leading "res/" from
# a filesystem path, leaving <domain>/<author>/... .
#   argument - a complete filesystem path (or an already shortened one).
#   returns  - the remaining part of the path.
sub declutter
{
    my $filesystem_path = shift(@_);

    # Remove beginning portions of the filesystem path.  The document root
    # is matched literally (\Q...\E), so characters such as "." in the
    # configured path are not treated as regular expression syntax.
    $filesystem_path =~ s/^\Q$perlvar{'lonDocRoot'}\E//;
    $filesystem_path =~ s!^/!!;
    $filesystem_path =~ s!^res/!!;

    # Return what is remaining for the filesystem path.
    return($filesystem_path);
}
|
|
|
=pod |
|
|
|
B<query_home_server_status> - Is this the home server of an author's directory? |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$author_filesystem_path> - directory path for a user. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<boolean> - 1 if true; 0 if false. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
# Tell whether this machine holds the user directory of the author whose
# resource directory is given.
#   argument - path of the form <lonDocRoot>/res/<domain>/<author>[/...].
#   returns  - 1 if the author's user directory exists here; 0 otherwise.
sub query_home_server_status ($)
{
    my $author_filesystem_path = shift(@_);

    # Reduce the path to "<domain>/<author>".  The configured document root
    # is used (matched literally) because that is what the caller builds the
    # path from.
    $author_filesystem_path =~
        s!\Q$perlvar{'lonDocRoot'}\E/res/([^/]*)/([^/]*).*!$1/$2!;

    # Construct path to the author's ordinary user directory.
    my ($user_domain,$username) = split(m!/!,$author_filesystem_path);
    my $user_directory_path = construct_path_to_user_directory($user_domain,
                                                                $username);

    # Return status of whether the user directory path exists.
    return((-e $user_directory_path) ? 1 : 0);
}
|
|
|
=pod |
|
|
|
B<construct_path_to_user_directory> ($$) - makes a filesystem path to user dir. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$user_domain> - the loncapa domain of the user. |
|
|
|
=item I<$username> - the unique username (user id) of the user. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<string> - representing the path on the filesystem. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
# Build the filesystem path of a user's directory:
#   <lonUsersDir>/<domain>/<c1>/<c2>/<c3>/<username>
# where c1..c3 are the first three characters of the username (padded with
# "_" when the username is shorter).
#   $user_domain - the loncapa domain of the user.
#   $username    - the unique username of the user.
#   returns      - the path as a string.
sub construct_path_to_user_directory ($$)
{
    my ($user_domain,$username) = @_;

    # Untaint.
    $user_domain =~ s/\W//g;
    $username =~ s/\W//g;

    # Create three levels of sub-directoried filesystem path
    # based on the first three characters of the username.
    # No trailing slash here: join() below supplies the separators, and a
    # trailing slash would produce "//" in front of the username.
    my $sub_filesystem_path = $username.'__';
    $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3!;

    # Use the sub-directoried levels and other variables to generate
    # the complete filesystem path.
    my $complete_filesystem_path =
        join('/',($perlvar{'lonUsersDir'},
                  $user_domain,
                  $sub_filesystem_path,
                  $username));

    # Return the complete filesystem path.
    return($complete_filesystem_path);
}
|
|
|
=pod |
|
|
|
B<sql_formatted_time> (@) - turns seconds since epoch into datetime sql format. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$epochtime> - time in seconds since epoch (may need to be sanitized). |
|
|
|
=back |
|
|
|
=over 4 |
# ------------------------------------------------------------ Serves up a file |
|
# returns either the contents of the file or a -1 |
|
sub getfile { |
|
my $file=shift; |
|
if (! -e $file ) { return -1; }; |
|
my $fh=IO::File->new($file); |
|
my $a=''; |
|
while (<$fh>) { $a .=$_; } |
|
return $a |
|
} |
|
|
Returns: |
# ------------------------------------------------------------- Declutters URLs |
|
sub declutter { |
|
my $thisfn=shift; |
|
$thisfn=~s/^$perlvar{'lonDocRoot'}//; |
|
$thisfn=~s/^\///; |
|
$thisfn=~s/^res\///; |
|
return $thisfn; |
|
} |
|
|
=item C<string> - datetime sql formatted string. |
# --------------------------------------- Is this the home server of an author? |
|
# (copied from lond, modification of the return value) |
|
sub ishome { |
|
my $author=shift; |
|
$author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/; |
|
my ($udom,$uname)=split(/\//,$author); |
|
my $proname=propath($udom,$uname); |
|
if (-e $proname) { |
|
return 1; |
|
} else { |
|
return 0; |
|
} |
|
} |
|
|
=back |
# -------------------------------------------- Return path to profile directory
# (copied from lond)
# Builds "<lonUsersDir>/<domain>/<c1>/<c2>/<c3>/<user>", where c1..c3 are the
# first three characters of the user name (padded with '_' when shorter).
sub propath {
    my ($udom, $uname) = @_;

    # Drop every non-word character from both components.
    foreach my $component ($udom, $uname) {
        $component =~ s/\W//g;
    }

    # The two appended underscores guarantee three characters to split on.
    my $subdir = $uname . '__';
    $subdir =~ s/(.)(.)(.).*/$1\/$2\/$3/;

    return join('/', $perlvar{'lonUsersDir'}, $udom, $subdir, $uname);
}
|
|
=cut |
# ---------------------------- convert 'time' format into a datetime sql format
# Older entry point: runs its first argument through unsqltime() and returns
# the local time as "YEAR-MON-MDAY HOUR:MIN:SEC" (fields are not zero-padded).
sub sqltime {
    # $_[0] is the single scalar wanted here (the old code used the
    # one-element slice @_[0]).
    my ($sec,$min,$hour,$mday,$mon,$year) = localtime(&unsqltime($_[0]));

    # localtime reports months as 0..11 and years as an offset from 1900.
    $mon++; $year += 1900;
    return "$year-$mon-$mday $hour:$min:$sec";
}

# Newer entry point: same output format as sqltime, but the argument is run
# through sanitize_time() first.
sub sql_formatted_time ($)
{
    # Sanitize the time argument and convert to localtime array.
    # The & call form is kept because sanitize_time is declared with a
    # prototype further down the file.
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) =
        localtime(&sanitize_time(shift(@_)));

    # Convert month from (0..11) to (1..12).
    $mon += 1;

    # Make the year compatible with A.D. specification.
    $year += 1900;

    # Return a date which is compatible with MySQL's "DATETIME" format.
    return(join('-',($year,$mon,$mday)).
           ' '.
           join(':',($hour,$min,$sec))
           );
}
|
|
|
|
|
# ==================================== The following two subroutines are needed |
|
# for accommodating incorrect time formats inside the metadata. |
|
|
|
=pod |
|
|
|
B<make_seconds_since_epoch> (@) - turns time metadata into seconds since epoch. |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<%time_metadata> - a key-value listing characterizing month, year, etc. |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<integer> - seconds since epoch. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
sub make_seconds_since_epoch (@)
{
    # Keytable of time metadata.  Keys read below: seconds, minutes, hours,
    # day, month (1..12), year (four digits) and, optionally, dlsav.
    my %time_metadata = @_;

    # When the caller supplies no 'dlsav' flag, pass -1 so that mktime works
    # out for itself whether daylight saving time applies.  Passing undef
    # would be read as 0 ("no DST") and shift summer local times by an hour.
    my $is_dst = defined($time_metadata{'dlsav'}) ? $time_metadata{'dlsav'}
                                                  : -1;

    # Return seconds since the epoch (January 1, 1970, 00:00:00 UTC).
    # mktime wants a 0-based month and a year counted from 1900; the
    # weekday and day-of-year arguments are passed as 0.
    return(POSIX::mktime(
               $time_metadata{'seconds'},
               $time_metadata{'minutes'},
               $time_metadata{'hours'},
               $time_metadata{'day'},
               $time_metadata{'month'}-1,
               $time_metadata{'year'}-1900,
               0,
               0,
               $is_dst
           )
          );
}
|
|
|
=pod |
|
|
|
B<sanitize_time> - if time looks sql-formatted, make it seconds since epoch. |
|
|
|
Somebody described this subroutine as |
|
"retro-fixing of un-backward-compatible time format". |
|
|
|
What this means is that a part of this code expects to get UTC seconds |
|
since the epoch (beginning of 1970). Yet, some of the .meta files have |
|
sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch |
|
integers (e.g. 1044147435). These time strings do not encode the timezone |
|
and, in this sense, can be considered "un-backwards-compatible". |
|
|
|
=over 4 |
|
|
|
Parameters: |
|
|
|
=item I<$potentially_badformat_string> - string to "retro-fix". |
|
|
|
=back |
|
|
|
=over 4 |
|
|
|
Returns: |
|
|
|
=item C<integer> - seconds since epoch. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
sub sanitize_time ($)
{
    my ($timestamp) = @_;

    # Pull the six numeric fields out of a "Y-M-D H:M:S" string, if that is
    # what we were handed.
    my @fields =
        ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/);

    # Anything else is assumed to be seconds since epoch already.
    if (!@fields)
    {
        return($timestamp);
    }

    # Convert the unexpected format into seconds since epoch.
    my ($year,$month,$day,$hours,$minutes,$seconds) = @fields;
    $timestamp = &make_seconds_since_epoch(
                                           'year' => $year,
                                           'month' => $month,
                                           'day' => $day,
                                           'hours' => $hours,
                                           'minutes' => $minutes,
                                           'seconds' => $seconds
                                           );
    return($timestamp);
}
|
|
|
=pod |
|
|
|
=head1 AUTHOR |
# Turns a key-value time description (seconds, minutes, hours, day,
# month 1..12, four-digit year, optional dlsav) into seconds since epoch.
sub maketime {
    my %th = @_;

    # Without an explicit 'dlsav' flag, pass -1 so that mktime decides
    # whether daylight saving time applies; undef would be read as 0
    # ("no DST") and shift summer local times by an hour.
    my $isdst = defined($th{'dlsav'}) ? $th{'dlsav'} : -1;

    # mktime wants a 0-based month and a year counted from 1900.
    return POSIX::mktime(
        $th{'seconds'}, $th{'minutes'}, $th{'hours'},
        $th{'day'}, $th{'month'}-1, $th{'year'}-1900, 0, 0, $isdst);
}
|
|
Written to help the loncapa project. |
|
|
|
Scott Harrison, sharrison@users.sourceforge.net |
######################################### |
|
# |
|
# Retro-fixing of un-backward-compatible time format |
|
|
This is distributed under the same terms as loncapa (i.e. "freeware"). |
# Accepts either seconds since epoch or a "Y-M-D H:M:S" string; the string
# form is converted through maketime(), anything else is returned untouched.
sub unsqltime {
    my ($timestamp) = @_;

    my @parts =
        ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/);

    # Not in the sql datetime format: hand it back as it came in.
    if (!@parts) {
        return $timestamp;
    }

    my ($year, $month, $day, $hours, $minutes, $seconds) = @parts;
    $timestamp = &maketime(
        'year'=>$year, 'month'=>$month, 'day'=>$day,
        'hours'=>$hours, 'minutes'=>$minutes, 'seconds'=>$seconds);
    return $timestamp;
}
|
|
=cut |
|