--- loncom/metadata_database/searchcat.pl 2003/02/03 17:01:55 1.30 +++ loncom/metadata_database/searchcat.pl 2003/10/08 14:15:03 1.42 @@ -2,7 +2,7 @@ # The LearningOnline Network # searchcat.pl "Search Catalog" batch script # -# $Id: searchcat.pl,v 1.30 2003/02/03 17:01:55 www Exp $ +# $Id: searchcat.pl,v 1.42 2003/10/08 14:15:03 www Exp $ # # Copyright Michigan State University Board of Trustees # @@ -26,15 +26,6 @@ # # http://www.lon-capa.org/ # -# YEAR=2001 -# 04/14/2001, 04/16/2001 Scott Harrison -# -# YEAR=2002 -# 05/11/2002 Scott Harrison -# -# YEAR=2003 -# Scott Harrison -# ### =pod @@ -52,7 +43,6 @@ filesystem installation location: F - C<10 1 * * 7 www /home/httpd/perl/searchcat.pl> This script only allows itself to be run as the user C. @@ -65,8 +55,9 @@ The metadata is entered into a SQL datab This script also does general database maintenance such as reformatting the C table if it is deprecated. -This script also builds dynamic temporal metadata and stores this inside -a F database file. +This script evaluates dynamic metadata from the authors' +F database file in order to store it in MySQL, as +well as to compress the filesize (add up all "count"-type metadata). This script is playing an increasingly important role for a loncapa library server. The proper operation of this script is critical for a smooth @@ -74,10 +65,6 @@ and correct user experience. =cut -# ========================================================== Setting things up. - -# ------------------------------------------------------ Use external modules. - use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; @@ -87,866 +74,406 @@ use DBI; use GDBM_File; use POSIX qw(strftime mktime); -# ----------------- Code to enable 'find' subroutine listing of the .meta files -use File::Find; - -# List of .meta files (used on a per-user basis). my @metalist; -# --------------- Read loncapa_apache.conf and loncapa.conf and get variables. -my $perlvarref = LONCAPA::Configuration::read_conf('loncapa.conf'); -my %perlvar = %{$perlvarref}; -undef($perlvarref); # Remove since sensitive and not needed. -delete($perlvar{'lonReceipt'}); # Remove since sensitive and not needed. - -# ------------------------------------- Only run if machine is a library server -if ($perlvar{'lonRole'} ne 'library') - { - exit(0); - } - -# ------------------------------ Make sure this process is running as user=www. -my $wwwid = getpwnam('www'); -if ($wwwid != $<) - { - $emailto = "$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; - $subj = "LON: $perlvar{'lonHostID'} User ID mismatch"; - system("echo 'User ID mismatch. searchcat.pl must be run as user www.' | ". - "mailto $emailto -s '$subj' > /dev/null"); - exit(1); - } - -# ------------------------------------------------------ Initialize log output. -open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); -print(LOG '==== Searchcat Run '.localtime().' ===='."\n\n"); - -my $dbh; # Database object reference handle. - -# ----------------------------- Verify connection to loncapa:metadata database. -unless ( - $dbh = DBI->connect('DBI:mysql:loncapa','www', - $perlvar{'lonSqlAccess'}, - { RaiseError => 0,PrintError => 0}) - ) - { - print(LOG '**** ERROR **** Cannot connect to database!'."\n"); - exit(0); - } - -# ------------------------------ Create loncapa:metadata table if non-existent. -my $make_metadata_table = 'CREATE TABLE IF NOT EXISTS metadata ('. - 'title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, '. - 'version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, '. - 'creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, '. - 'copyright TEXT, utilitysemaphore BOOL, FULLTEXT idx_title (title), '. - 'FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), '. - 'FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), '. - 'FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), '. - 'FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), '. - 'FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), '. - 'FULLTEXT idx_copyright (copyright)) TYPE=MYISAM'; - -$dbh->do($make_metadata_table); # Generate the table. - -# ----------------------------- Verify format of the loncapa:metadata database. -# (delete and recreate database if necessary). - -# Make a positive control for verifying table structure. -my $make_metadata_table_CONTROL = $make_metadata_table; -$make_metadata_table_CONTROL =~ - s/^(CREATE TABLE IF NOT EXISTS) metadata/$1 CONTROL_metadata/; - -$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); -$dbh->do($make_metadata_table_CONTROL); - -my $table_description; # selectall reference to the table description. - -my $CONTROL_table_string; # What the table description should look like. -my $table_string; # What the table description does look like. - -# Calculate the CONTROL table description (what it should be). -$table_description = $dbh->selectall_arrayref('describe CONTROL_metadata'); -foreach my $table_row (@{$table_description}) - { - $CONTROL_table_string .= join(',',@{$table_row})."\n"; - } - -# Calculate the current table description (what it currently looks like). -$table_description = $dbh->selectall_arrayref('describe metadata'); -foreach my $table_row (@{$table_description}) - { - $table_string .= join(',',@{$table_row})."\n"; - } - -if ($table_string ne $CONTROL_table_string) - { - # Log this incident. - print(LOG '**** WARNING **** Table structure mismatch, need to regenerate'. - '.'."\n"); - # Delete the table. - $dbh->do('DROP TABLE IF EXISTS metadata'); - # Generate the table. - $dbh->do($make_metadata_table); - } - -$dbh->do('DROP TABLE IF EXISTS CONTROL_metadata'); # Okay. Done with control. - -# ----------------------------------------------- Set utilitysemaphore to zero. -$dbh->do('UPDATE metadata SET utilitysemaphore = 0'); - -# ========================================================= Main functionality. - -# - Determine home authors on this server based on resources dir and user tree. - -# RESOURCES: the resources directory (subdirs correspond to author usernames). -opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}") or - (print(LOG '=== /res/--lonDefDomain-- directory is not accessible'."\n") - and exit(0)); - -# query_home_server_status will look for user home directories on this machine. -my @homeusers = - grep {&query_home_server_status($perlvar{'lonDocRoot'}.'/res/'. - $perlvar{'lonDefDomain'}.'/'.$_) - } grep {!/^\.\.?$/} readdir(RESOURCES); -closedir(RESOURCES); - -unless (@homeusers) - { - print(LOG '=== No home users found on this server.'."\n"); - } - -# Consider each author individually. -foreach my $user (@homeusers) - { - # Make a log entry. - print(LOG "\n".'=== User: '.$user."\n\n"); - - # Get filesystem path to this user's directory. - my $user_directory = - &construct_path_to_user_directory($perlvar{'lonDefDomain'},$user); - - # Remove left-over db-files from a potentially crashed searchcat run. - unlink($user_directory.'/nohist_new_resevaldata.db'); - - # Cleanup the metalist array. - undef(@metalist); - @metalist = (); - - # This will add entries to the @metalist array. - &File::Find::find(\&wanted, - $perlvar{'lonDocRoot'}.'/res/'. - $perlvar{'lonDefDomain'}.'/'.$user); - - # -- process file to get metadata and put into search catalog SQL database - # Also, build and store dynamic metadata. - # Also, delete record entries before refreshing. - foreach my $m (@metalist) - { - # Log this action. - print(LOG "- ".$m."\n"); - - # Get metadata from the file. - my $ref = get_metadata_from_file($m); - - # Make a datarecord identifier for this resource. - my $m2 = '/res/'.declutter($m); - $m2 =~ s/\.meta$//; - - # Build and store dynamic metadata inside nohist_resevaldata.db. - build_on_the_fly_dynamic_metadata($m2); - - # Delete record if it already exists. - my $q2 = 'select * from metadata where url like binary '."'".$m2."'"; - my $sth = $dbh->prepare($q2); - $sth->execute(); - my $r1 = $sth->fetchall_arrayref; - if (@$r1) - { - $sth = - $dbh->prepare('delete from metadata where url like binary '. - "'".$m2."'"); - $sth->execute(); - } - - # Add new/replacement record into the loncapa:metadata table. - $sth = $dbh->prepare('insert into metadata values ('. - '"'.delete($ref->{'title'}).'"'.','. - '"'.delete($ref->{'author'}).'"'.','. - '"'.delete($ref->{'subject'}).'"'.','. - '"'.$m2.'"'.','. - '"'.delete($ref->{'keywords'}).'"'.','. - '"'.'current'.'"'.','. - '"'.delete($ref->{'notes'}).'"'.','. - '"'.delete($ref->{'abstract'}).'"'.','. - '"'.delete($ref->{'mime'}).'"'.','. - '"'.delete($ref->{'language'}).'"'.','. - '"'.sql_formatted_time( - delete($ref->{'creationdate'})).'"'.','. - '"'.sql_formatted_time( - delete($ref->{'lastrevisiondate'})).'"'.','. - '"'.delete($ref->{'owner'}).'"'.','. - '"'.delete($ref->{'copyright'}).'"'.','. - '1'.')'); - $sth->execute(); - } - -# ----------------------- Clean up database, remove stale SQL database records. - $dbh->do('DELETE FROM metadata WHERE utilitysemaphore = 0'); - -# -------------------------------------------------- Copy over the new db-files - system('mv '.$user_directory.'/nohist_new_resevaldata.db '. - $user_directory.'/nohist_resevaldata.db'); - } - -# --------------------------------------------------- Close database connection -$dbh->disconnect; -print LOG "\n==== Searchcat completed ".localtime()." ====\n"; -close(LOG); -exit(0); - -# ================================================================ Subroutines. - -=pod - -=head1 SUBROUTINES - -=cut - -=pod - -B - translate to unstrange escaped syntax to strange characters. - -=over 4 - -Parameters: +$simplestatus=''; +my %countext=(); -=item I<$str> - string with unweird characters. - -=back - -=over 4 - -Returns: - -=item C - string with potentially weird characters. +sub writesimple { + open(SMP,'>/home/httpd/html/lon-status/mysql.txt'); + print SMP $simplestatus."\n"; + close(SMP); +} -=back +sub writecount { + open(RSMP,'>/home/httpd/html/lon-status/rescount.txt'); + foreach (keys %countext) { + print RSMP $_.'='.$countext{$_}.'&'; + } + print RSMP 'time='.time."\n"; + close(RSMP); +} -=cut +sub count { + my $file=shift; + $file=~/\.(\w+)$/; + my $ext=lc($1); + if (defined($countext{$ext})) { + $countext{$ext}++; + } else { + $countext{$ext}=1; + } +} +# ----------------------------------------------------- Un-Escape Special Chars -sub unescape ($) - { - my $str = shift(@_); +sub unescape { + my $str=shift; $str =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; - return($str); - } - -=pod - -B - translate strange characters to unstrange escaped syntax. - -=over 4 - -Parameters: - -=item I<$str> - string with potentially weird characters to unweird-ify. - -=back - -=over 4 - -Returns: - -=item C - unweird-ified string. + return $str; +} -=back +# -------------------------------------------------------- Escape Special Chars -=cut - -sub escape ($) - { - my $str = shift(@_); +sub escape { + my $str=shift; $str =~ s/(\W)/"%".unpack('H2',$1)/eg; - return($str); - } - -=pod - -B - evaluate and store dynamic metadata. - -Returns the dynamic metadata for an author, which will later be added to the -MySQL database (not yet implemented). - -The vast majority of entries in F, which contains -the dynamic metadata for an author's resources, are "count", which make -the file really large and evaluation really slow. - -While computing the current value of all dynamic metadata -for later insertion into the MySQL metadata cache (not yet implemented), -this routine also simply adds up all "count" type fields and replaces them by -one new field with the to-date count. - -Only after successful completion of working with one author, copy new file to -original file. Copy to tmp-"new"-db-file was necessary since db-file size -would not shrink after "delete" of key. - -=over 4 - -Parameters: - -=item I<$url> - the filesystem path (url may be a misnomer...) - -=back - -=over 4 - -Returns: - -=item C - key-value table of dynamically evaluated metadata. + return $str; +} -=back -=cut +# ------------------------------------------- Code to evaluate dynamic metadata -sub build_on_the_fly_dynamic_metadata { +sub dynamicmeta { - # Need to compute the user's directory. my $url=&declutter(shift); $url=~s/\.meta$//; my %returnhash=(); my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//); - my $user_directory=&construct_path_to_user_directory($adomain,$aauthor); - - # Attempt a GDBM database instantiation inside users directory and proceed. + my $prodir=&propath($adomain,$aauthor); if ((tie(%evaldata,'GDBM_File', - $user_directory. - '/nohist_resevaldata.db',&GDBM_READER(),0640)) && + $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) && (tie(%newevaldata,'GDBM_File', - $user_directory. - '/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640))) { - # For different variables, track the running sum and counts. - my %sum=(); - my %cnt=(); - - # Define computed items as a sum (add) or an average (avg) or a raw - # count (cnt) or append (app)? - my %listitems=('count' => 'add', - 'course' => 'add', - 'avetries' => 'avg', - 'stdno' => 'add', - 'difficulty' => 'avg', - 'clear' => 'avg', - 'technical' => 'avg', - 'helpful' => 'avg', - 'correct' => 'avg', - 'depth' => 'avg', - 'comments' => 'app', - 'usage' => 'cnt' - ); - - # Untaint the url and use as part of a regular expression. - my $regexp=$url; - $regexp=~s/(\W)/\\$1/g; - $regexp='___'.$regexp.'___([a-z]+)$'; #' emacs - - # Check existing database for this author. - # this is modifying the 'count' entries - # and copying all other entries over - - foreach (keys %evaldata) { - my $key=&unescape($_); - if ($key=~/$regexp/) { # If url-based entry exists. - my $ctype=$1; # Set to specific category type. - - # Do an increment for this category type. - if (defined($cnt{$ctype})) { - $cnt{$ctype}++; - } else { - $cnt{$ctype}=1; - } - unless ($listitems{$ctype} eq 'app') { # append comments - # Increment the sum based on the evaluated data in the db. - if (defined($sum{$ctype})) { - $sum{$ctype}+=$evaldata{$_}; - } else { - $sum{$ctype}=$evaldata{$_}; - } - } else { # 'app' mode, means to use '
' as a separator - if (defined($sum{$ctype})) { - if ($evaldata{$_}) { - $sum{$ctype}.='
'.$evaldata{$_}; - } - } else { - $sum{$ctype}=''.$evaldata{$_}; - } - } - if ($ctype ne 'count') { - # this is copying all data except 'count' attributes - $newevaldata{$_}=$evaldata{$_}; - } - } - } - - # these values will be returned (currently still unused) - foreach (keys %cnt) { - if ($listitems{$_} eq 'avg') { - $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0; - } elsif ($listitems{$_} eq 'cnt') { - $returnhash{$_}=$cnt{$_}; - } else { - $returnhash{$_}=$sum{$_}; + $prodir.'/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640))) { + my %sum=(); + my %cnt=(); + my %listitems=('count' => 'add', + 'course' => 'add', + 'goto' => 'add', + 'comefrom' => 'add', + 'avetries' => 'avg', + 'stdno' => 'add', + 'difficulty' => 'avg', + 'clear' => 'avg', + 'technical' => 'avg', + 'helpful' => 'avg', + 'correct' => 'avg', + 'depth' => 'avg', + 'comments' => 'app', + 'usage' => 'cnt' + ); + my $regexp=$url; + $regexp=~s/(\W)/\\$1/g; + $regexp='___'.$regexp.'___([a-z]+)$'; + while (my ($key,$value)=each %evaldata) { + $key=&unescape($key); + if ($key=~/$regexp/) { + my $ctype=$1; + if (defined($cnt{$ctype})) { + $cnt{$ctype}++; + } else { + $cnt{$ctype}=1; + } + unless ($listitems{$ctype} eq 'app') { + if (defined($sum{$ctype})) { + $sum{$ctype}+=$value; + } else { + $sum{$ctype}=$value; + } + } else { + if (defined($sum{$ctype})) { + if ($value) { + $sum{$ctype}.='
'.$value; + } + } else { + $sum{$ctype}=''.$value; + } } - } - - # generate new count key in resevaldata, insert sum - if ($returnhash{'count'}) { - my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count'; - $newevaldata{$newkey}=$returnhash{'count'}; - } - - untie(%evaldata); # Close/release the original nohist database. - untie(%newevaldata); # Close/release the new nohist database. - } - return %returnhash; + if ($ctype ne 'count') { + $newevaldata{$_}=$value; + } + } + } + foreach (keys %cnt) { + if ($listitems{$_} eq 'avg') { + $returnhash{$_}=int(($sum{$_}/$cnt{$_})*100.0+0.5)/100.0; + } elsif ($listitems{$_} eq 'cnt') { + $returnhash{$_}=$cnt{$_}; + } else { + $returnhash{$_}=$sum{$_}; + } + } + if ($returnhash{'count'}) { + my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count'; + $newevaldata{$newkey}=$returnhash{'count'}; + } + untie(%evaldata); + untie(%newevaldata); + } + return %returnhash; } - -=pod - -B - used by B subroutine. - -This evaluates whether a file is wanted, and pushes it onto the -I<@metalist> array. This subroutine was, for the most part, auto-generated -by the B command. - -=over 4 - -Parameters: - -=item I<$file> - a path to the file. - -=back - -=over 4 - -Returns: - -=item C - true or false based on logical statement. - -=back - -=cut - -sub wanted ($) - { + +# ----------------- Code to enable 'find' subroutine listing of the .meta files +require "find.pl"; +sub wanted { (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) && - -f $_ && - /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && - push(@metalist,$File::Find::dir.'/'.$_); - } - -=pod - -B - read xml-tagged file and return parsed metadata. - -I - -=over 4 - -Parameters: - -=item I<$file> - a path.to the file. - -=back - -=over 4 - -Returns: - -=item C - a hash array (keys and values). - -=back - -=cut - -sub get_metadata_from_file ($) - { - my ($filename) = @_; - my %metatable; # Used to store return value of hash-tabled metadata. - $filename = &declutter($filename); # Remove non-identifying filesystem info - my $uri = ''; # The URI is not relevant in this scenario. - unless ($filename =~ m/\.meta$/) # Unless ending with .meta. - { - $filename .= '.meta'; # Append a .meta suffix. - } - # Get the file contents. - my $metadata_string = - &get_file_contents($perlvar{'lonDocRoot'}.'/res/'.$filename); - - # Parse the file based on its XML tags. - my $parser = HTML::TokeParser->new(\$metadata_string); - my $token; - while ($token = $parser->get_token) # Loop through tokens. - { - if ($token->[0] eq 'S') # If it is a start token. - { - my $entry = $token->[1]; - my $unikey = $entry; # A unique identifier for this xml tag key. - if (defined($token->[2]->{'part'})) - { - $unikey .= '_'.$token->[2]->{'part'}; - } - if (defined($token->[2]->{'name'})) - { - $unikey .= '_'.$token->[2]->{'name'}; - } - # Append $unikey to metatable's keys entry. - if ($metatable{$uri.'keys'}) - { - $metatable{$uri.'keys'} .= ','.$unikey; - } - else - { - $metatable{$uri.'keys'} = $unikey; - } - # Insert contents into metatable entry for the unikey. - foreach my $t3 (@{$token->[3]}) - { - $metatable{$uri.''.$unikey.'.'.$_} = $token->[2]->{$t3}; - } - # If there was no text contained inside the tags, set = default. - unless - ( - $metatable{$uri.''.$unikey} = $parser->get_text('/'.$entry) - ) - { - $metatable{$uri.''.$unikey} = - $metatable{$uri.''.$unikey.'.default'}; - } - } - } - # Return with a key-value table of XML tags and their tag contents. - return(\%metatable); - } - -=pod - -B - returns either the contents of the file or a -1. - -=over 4 - -Parameters: - -=item I<$file> - a complete filesystem path.to the file. - -=back - -=over 4 - -Returns: - -=item C - file contents or a -1. - -=back - -=cut - -sub get_file_contents ($) - { - my $file = shift(@_); - - # If file does not exist, then return a -1 value. - unless (-e $file) - { - return(-1); - } - - # Read in file contents. - my $file_handle = IO::File->new($file); - my $file_contents = ''; - while (<$file_handle>) - { - $file_contents .= $_; - } - - # Return file contents. - return($file_contents); - } - -=pod - -B - Declutters URLs (remove extraneous prefixed filesystem path). - -=over 4 - -Parameters: - -=item I<$filesystem_path> - a complete filesystem path. - -=back - -=over 4 - -Returns: - -=item C - remnants of the filesystem path (beginning portion removed). - -=back - -=cut - -sub declutter - { - my $filesystem_path = shift(@_); - - # Remove beginning portions of the filesystem path. - $filesystem_path =~ s/^$perlvar{'lonDocRoot'}//; - $filesystem_path =~ s!^/!!; - $filesystem_path =~ s!^res/!!; - - # Return what is remaining for the filesystem path. - return($filesystem_path); - } - -=pod - -B - Is this the home server of an author's directory? - -=over 4 - -Parameters: - -=item I<$author_filesystem_path> - directory path for a user. - -=back - -=over 4 - -Returns: - -=item C - 1 if true; 0 if false. - -=back - -=cut - -sub query_home_server_status ($) - { - my $author_filesystem_path = shift(@_); - - # Remove beginning portion of this filesystem path. - $author_filesystem_path =~ s!/home/httpd/html/res/([^/]*)/([^/]*).*!$1/$2!; - - # Construct path to the author's ordinary user directory. - my ($user_domain,$username) = split(m!/!,$author_filesystem_path); - my $user_directory_path = construct_path_to_user_directory($user_domain, - $username); - - # Return status of whether the user directory path is defined. - if (-e $user_directory_path) - { - return(1); # True. - } - else - { - return(0); # False. - } - } - -=pod - -B ($$) - makes a filesystem path to user dir. - -=over 4 - -Parameters: - -=item I<$user_domain> - the loncapa domain of the user. - -=item I<$username> - the unique username (user id) of the user. + -f _ && + /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ && + push(@metalist,"$dir/$_"); +} -=back +# --------------- Read loncapa_apache.conf and loncapa.conf and get variables +my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); +my %perlvar=%{$perlvarref}; +undef $perlvarref; # remove since sensitive and not needed +delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed -=over 4 +# ------------------------------------- Only run if machine is a library server +exit unless $perlvar{'lonRole'} eq 'library'; -Returns: +# ----------------------------- Make sure this process is running from user=www -=item C - representing the path on the filesystem. +my $wwwid=getpwnam('www'); +if ($wwwid!=$<) { + $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}"; + $subj="LON: $perlvar{'lonHostID'} User ID mismatch"; + system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\ + mailto $emailto -s '$subj' > /dev/null"); + exit 1; +} -=back -=cut +# ---------------------------------------------------------- We are in business -sub construct_path_to_user_directory ($$) - { - my ($user_domain,$username) = @_; - - # Untaint. - $user_domain =~ s/\W//g; - $username =~ s/\W//g; - - # Create three levels of sub-directoried filesystem path - # based on the first three characters of the username. - my $sub_filesystem_path = $username.'__'; - $sub_filesystem_path =~ s!(.)(.)(.).*!$1/$2/$3/!; - - # Use the sub-directoried levels and other variables to generate - # the complete filesystem path. - my $complete_filesystem_path = - join('/',($perlvar{'lonUsersDir'}, - $user_domain, - $sub_filesystem_path, - $username)); - - # Return the complete filesystem path. - return($complete_filesystem_path); - } +open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log'); +print LOG '==== Searchcat Run '.localtime()."====\n\n"; +$simplestatus='time='.time.'&'; +my $dbh; +# ------------------------------------- Make sure that database can be accessed +{ + unless ( + $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0}) + ) { + print LOG "Cannot connect to database!\n"; + $simplestatus.='mysql=defunct'; + &writesimple(); + exit; + } -=pod + my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (". + "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ". + "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ". + "creationdate DATETIME, lastrevisiondate DATETIME, owner TEXT, ". + "copyright TEXT, FULLTEXT idx_title (title), ". + "FULLTEXT idx_author (author), FULLTEXT idx_subject (subject), ". + "FULLTEXT idx_url (url), FULLTEXT idx_keywords (keywords), ". + "FULLTEXT idx_version (version), FULLTEXT idx_notes (notes), ". + "FULLTEXT idx_abstract (abstract), FULLTEXT idx_mime (mime), ". + "FULLTEXT idx_language (language), FULLTEXT idx_owner (owner), ". + "FULLTEXT idx_copyright (copyright)) TYPE=MYISAM"; + # It would sure be nice to have some logging mechanism. + $dbh->do($make_metadata_table); +} -B (@) - turns seconds since epoch into datetime sql format. +# ------------------------------------------------------------- get .meta files +opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}"); +my @homeusers = grep { + &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_") + } grep {!/^\.\.?$/} readdir(RESOURCES); +closedir RESOURCES; + +# +# Create the statement handlers we need +my $delete_sth = $dbh->prepare + ("DELETE FROM metadata WHERE url LIKE BINARY ?"); + +my $insert_sth = $dbh->prepare + ("INSERT INTO metadata VALUES (". + "?,". # title + "?,". # author + "?,". # subject + "?,". # m2??? + "?,". # version + "?,". # current + "?,". # notes + "?,". # abstract + "?,". # mime + "?,". # language + "?,". # creationdate + "?,". # revisiondate + "?,". # owner + "?)" # copyright + ); + +foreach my $user (@homeusers) { + print LOG "\n=== User: ".$user."\n\n"; + # Remove left-over db-files from potentially crashed searchcat run + my $prodir=&propath($perlvar{'lonDefDomain'},$user); + unlink($prodir.'/nohist_new_resevaldata.db'); + # Use find.pl + undef @metalist; + @metalist=(); + &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user"); + # -- process each file to get metadata and put into search catalog SQL + # database. Also, check to see if already there. + # I could just delete (without searching first), but this works for now. + foreach my $m (@metalist) { + print LOG "- ".$m."\n"; + my $ref=&metadata($m); + my $m2='/res/'.&declutter($m); + $m2=~s/\.meta$//; + &dynamicmeta($m2); + if ($ref->{'obsolete'}) { print LOG "obsolete\n"; next; } + if ($ref->{'copyright'} eq 'private') { print LOG "private\n"; next; } + &count($m2); + $delete_sth->execute($m2); + $insert_sth->execute($ref->{'title'}, + $ref->{'author'}, + $ref->{'subject'}, + $m2, + $ref->{'keywords'}, + 'current', + $ref->{'notes'}, + $ref->{'abstract'}, + $ref->{'mime'}, + $ref->{'language'}, + sqltime($ref->{'creationdate'}), + sqltime($ref->{'lastrevisiondate'}), + $ref->{'owner'}, + $ref->{'copyright'}); +# if ($dbh->err()) { +# print STDERR "Error:".$dbh->errstr()."\n"; +# } + $ref = undef; + } + + # --------------------------------------------------- Clean up database + # Need to, perhaps, remove stale SQL database records. + # ... not yet implemented + + # ------------------------------------------- Copy over the new db-files + # -=over 4 + system('mv '.$prodir.'/nohist_new_resevaldata.db '. + $prodir.'/nohist_resevaldata.db'); -Parameters: +} +# --------------------------------------------------- Close database connection +$dbh->disconnect; +print LOG "\n==== Searchcat completed ".localtime()." ====\n"; +close(LOG); +&writesimple(); +&writecount(); +exit 0; -=item I<$epochtime> - time in seconds since epoch (may need to be sanitized). -=back -=over 4 +# ============================================================================= + +# ---------------------------------------------------------------- Get metadata +# significantly altered from subroutine present in lonnet +sub metadata { + my ($uri,$what)=@_; + my %metacache; + $uri=&declutter($uri); + my $filename=$uri; + $uri=~s/\.meta$//; + $uri=''; + unless ($metacache{$uri.'keys'}) { + unless ($filename=~/\.meta$/) { $filename.='.meta'; } + my $metastring=&getfile($perlvar{'lonDocRoot'}.'/res/'.$filename); + my $parser=HTML::TokeParser->new(\$metastring); + my $token; + while ($token=$parser->get_token) { + if ($token->[0] eq 'S') { + my $entry=$token->[1]; + my $unikey=$entry; + if (defined($token->[2]->{'part'})) { + $unikey.='_'.$token->[2]->{'part'}; + } + if (defined($token->[2]->{'name'})) { + $unikey.='_'.$token->[2]->{'name'}; + } + if ($metacache{$uri.'keys'}) { + $metacache{$uri.'keys'}.=','.$unikey; + } else { + $metacache{$uri.'keys'}=$unikey; + } + map { + $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_}; + } @{$token->[3]}; + unless ( + $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry) + ) { $metacache{$uri.''.$unikey}= + $metacache{$uri.''.$unikey.'.default'}; + } + } + } + } + return \%metacache; +} -Returns: +# ------------------------------------------------------------ Serves up a file +# returns either the contents of the file or a -1 +sub getfile { + my $file=shift; + if (! -e $file ) { return -1; }; + my $fh=IO::File->new($file); + my $a=''; + while (<$fh>) { $a .=$_; } + return $a; +} -=item C - datetime sql formatted string. +# ------------------------------------------------------------- Declutters URLs +sub declutter { + my $thisfn=shift; + $thisfn=~s/^$perlvar{'lonDocRoot'}//; + $thisfn=~s/^\///; + $thisfn=~s/^res\///; + return $thisfn; +} -=back +# --------------------------------------- Is this the home server of an author? +# (copied from lond, modification of the return value) +sub ishome { + my $author=shift; + $author=~s/\/home\/httpd\/html\/res\/([^\/]*)\/([^\/]*).*/$1\/$2/; + my ($udom,$uname)=split(/\//,$author); + my $proname=propath($udom,$uname); + if (-e $proname) { + return 1; + } else { + return 0; + } +} -=cut +# -------------------------------------------- Return path to profile directory +# (copied from lond) +sub propath { + my ($udom,$uname)=@_; + $udom=~s/\W//g; + $uname=~s/\W//g; + my $subdir=$uname.'__'; + $subdir =~ s/(.)(.)(.).*/$1\/$2\/$3/; + my $proname="$perlvar{'lonUsersDir'}/$udom/$subdir/$uname"; + return $proname; +} -sub sql_formatted_time ($) - { - # Sanitize the time argument and convert to localtime array. +# ---------------------------- convert 'time' format into a datetime sql format +sub sqltime { my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = - localtime(&sanitize_time(shift(@_))); - - # Convert month from (0..11) to (1..12). - $mon += 1; - - # Make the year compatible with A.D. specification. - $year += 1900; - - # Return a date which is compatible with MySQL's "DATETIME" format. - return(join('-',($year,$mon,$mday)). - ' '. - join(':',($hour,$min,$sec)) - ); - } - - -# ==================================== The following two subroutines are needed -# for accommodating incorrect time formats inside the metadata. - -=pod - -B (@) - turns time metadata into seconds since epoch. - -=over 4 - -Parameters: - -=item I<%time_metadata> - a key-value listing characterizing month, year, etc. - -=back - -=over 4 - -Returns: - -=item C - seconds since epoch. - -=back - -=cut - -sub make_seconds_since_epoch (@) - { - # Keytable of time metadata. - my %time_metadata = @_; - - # Return seconds since the epoch (January 1, 1970, 00:00:00 UTC). - return(POSIX::mktime( - ($time_metadata{'seconds'}, - $time_metadata{'minutes'}, - $time_metadata{'hours'}, - $time_metadata{'day'}, - $time_metadata{'month'}-1, - $time_metadata{'year'}-1900, - 0, - 0, - $time_metadata{'dlsav'}) - ) - ); - } - -=pod - -B - if time looks sql-formatted, make it seconds since epoch. - -Somebody described this subroutine as -"retro-fixing of un-backward-compatible time format". - -What this means, is that a part of this code expects to get UTC seconds -since the epoch (beginning of 1970). Yet, some of the .meta files have -sql-formatted time strings (2001-04-01, etc.) instead of seconds-since-epoch -integers (e.g. 1044147435). These time strings do not encode the timezone -and, in this sense, can be considered "un-backwards-compatible". - -=over 4 - -Parameters: - -=item I<$potentially_badformat_string> - string to "retro-fix". - -=back - -=over 4 - -Returns: - -=item C - seconds since epoch. - -=back - -=cut - -sub sanitize_time ($) - { - my $timestamp = shift(@_); - # If timestamp is in this unexpected format.... - if ($timestamp =~ /^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) - { - # then convert into seconds since epoch (the expected format). - $timestamp = &make_seconds_since_epoch( - 'year' => $1, - 'month' => $2, - 'day' => $3, - 'hours' => $4, - 'minutes' => $5, - 'seconds' => $6 - ); - } - # Otherwise we assume timestamp to be as expected. - return($timestamp); - } - -=pod + localtime(&unsqltime(@_[0])); + $mon++; $year+=1900; + return "$year-$mon-$mday $hour:$min:$sec"; +} -=head1 AUTHOR +sub maketime { + my %th=@_; + return POSIX::mktime(($th{'seconds'},$th{'minutes'},$th{'hours'}, + $th{'day'},$th{'month'}-1, + $th{'year'}-1900,0,0,$th{'dlsav'})); +} -Written to help the loncapa project. -Scott Harrison, sharrison@users.sourceforge.net +######################################### +# +# Retro-fixing of un-backward-compatible time format -This is distributed under the same terms as loncapa (i.e. "freeware"). +sub unsqltime { + my $timestamp=shift; + if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) { + $timestamp=&maketime('year'=>$1,'month'=>$2,'day'=>$3, + 'hours'=>$4,'minutes'=>$5,'seconds'=>$6); + } + return $timestamp; +} -=cut