Diff for /loncom/metadata_database/searchcat.pl between versions 1.24 and 1.39

version 1.24, 2002/10/18 13:54:31 version 1.39, 2003/08/21 01:48:22
Line 26 Line 26
 #  #
 # http://www.lon-capa.org/  # http://www.lon-capa.org/
 #  #
 # YEAR=2001  
 # 04/14/2001, 04/16/2001 Scott Harrison  
 #  
 # YEAR=2002  
 # 05/11/2002 Scott Harrison  
 #  
 ###  ###
   
 # This script goes through a LON-CAPA resource  =pod
 # directory and gathers metadata.  
 # The metadata is entered into a SQL database.  =head1 NAME
   
   B<searchcat.pl> - put authoritative filesystem data into sql database.
   
   =head1 SYNOPSIS
   
   Ordinarily this script is to be called from a loncapa cron job
   (CVS source location: F<loncapa/loncom/cron/loncapa>; typical
   filesystem installation location: F</etc/cron.d/loncapa>).
   
   Here is the cron job entry.
   
   C<# Repopulate and refresh the metadata database used for the search catalog.>
   C<10 1 * * 7    www    /home/httpd/perl/searchcat.pl>
   
   This script only allows itself to be run as the user C<www>.
   
   =head1 DESCRIPTION
   
   This script goes through a loncapa resource directory and gathers metadata.
   The metadata is entered into a SQL database.
   
   This script also does general database maintenance such as reformatting
   the C<loncapa:metadata> table if it is deprecated.
   
   This script evaluates dynamic metadata from the authors'
   F<nohist_resevaldata.db> database file in order to store it in MySQL, as
   well as to compress the filesize (add up all "count"-type metadata).
   
   This script is playing an increasingly important role for a loncapa
   library server.  The proper operation of this script is critical for a smooth
   and correct user experience.
   
   =cut
   
 use lib '/home/httpd/lib/perl/';  use lib '/home/httpd/lib/perl/';
 use LONCAPA::Configuration;  use LONCAPA::Configuration;
Line 49  use POSIX qw(strftime mktime); Line 76  use POSIX qw(strftime mktime);
   
 my @metalist;  my @metalist;
   
   $simplestatus='';
   my %countext=();
   
   sub writesimple {
       open(SMP,'>/home/httpd/html/lon-status/mysql.txt');
       print SMP $simplestatus."\n";
       close(SMP);
   }
   
   sub writecount {
       open(RSMP,'>/home/httpd/html/lon-status/rescount.txt');
       foreach (keys %countext) {
    print RSMP $_.'='.$countext{$_}.'&';
       }
       print RSMP 'time='.time."\n";
       close(RSMP);
   }
   
   sub count {
       my $file=shift;
       $file=~/\.(\w+)$/;
       my $ext=lc($1);
       if (defined($countext{$ext})) {
    $countext{$ext}++;
       } else {
    $countext{$ext}=1;
       }
   }
 # ----------------------------------------------------- Un-Escape Special Chars  # ----------------------------------------------------- Un-Escape Special Chars
   
 sub unescape {  sub unescape {
Line 70  sub escape { Line 124  sub escape {
 # ------------------------------------------- Code to evaluate dynamic metadata  # ------------------------------------------- Code to evaluate dynamic metadata
   
 sub dynamicmeta {  sub dynamicmeta {
 #  
 #  
 # Do nothing for now ...  
 #  
 #  
     return;  
 #  
 # ..., but stuff below already works  
 #  
     my $url=&declutter(shift);      my $url=&declutter(shift);
     $url=~s/\.meta$//;      $url=~s/\.meta$//;
     my %returnhash=();      my %returnhash=();
     my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);      my ($adomain,$aauthor)=($url=~/^(\w+)\/(\w+)\//);
     my $prodir=&propath($adomain,$aauthor);      my $prodir=&propath($adomain,$aauthor);
     if (tie(%evaldata,'GDBM_File',      if ((tie(%evaldata,'GDBM_File',
             $prodir.'/nohist_resevaldata.db',&GDBM_WRCREAT(),0640)) {              $prodir.'/nohist_resevaldata.db',&GDBM_READER(),0640)) &&
           (tie(%newevaldata,'GDBM_File',
               $prodir.'/nohist_new_resevaldata.db',&GDBM_WRCREAT(),0640))) {
        my %sum=();         my %sum=();
        my %cnt=();         my %cnt=();
        my %listitems=('count'        => 'add',         my %listitems=('count'        => 'add',
Line 128  sub dynamicmeta { Line 176  sub dynamicmeta {
              $sum{$ctype}=''.$evaldata{$_};               $sum{$ctype}=''.$evaldata{$_};
        }         }
     }      }
     if ($ctype eq 'count') {      if ($ctype ne 'count') {
        delete($evaldata{$_});         $newevaldata{$_}=$evaldata{$_};
             }     }
  }   }
       }        }
       foreach (keys %cnt) {        foreach (keys %cnt) {
Line 144  sub dynamicmeta { Line 192  sub dynamicmeta {
      }       }
      if ($returnhash{'count'}) {       if ($returnhash{'count'}) {
          my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count';           my $newkey=$$.'_'.time.'_searchcat___'.&escape($url).'___count';
          $evaldata{$newkey}=$returnhash{'count'};           $newevaldata{$newkey}=$returnhash{'count'};
      }       }
      untie(%evaldata);       untie(%evaldata);
        untie(%newevaldata);
    }     }
    return %returnhash;     return %returnhash;
 }  }
Line 155  sub dynamicmeta { Line 204  sub dynamicmeta {
 require "find.pl";  require "find.pl";
 sub wanted {  sub wanted {
     (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&      (($dev,$ino,$mode,$nlink,$uid,$gid) = lstat($_)) &&
     -f _ &&          -f _ &&
     /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&          /^.*\.meta$/ && !/^.+\.\d+\.[^\.]+\.meta$/ &&
     push(@metalist,"$dir/$_");          push(@metalist,"$dir/$_");
 }  }
   
 # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables  # ---------------  Read loncapa_apache.conf and loncapa.conf and get variables
Line 169  delete $perlvar{'lonReceipt'}; # remove Line 218  delete $perlvar{'lonReceipt'}; # remove
 # ------------------------------------- Only run if machine is a library server  # ------------------------------------- Only run if machine is a library server
 exit unless $perlvar{'lonRole'} eq 'library';  exit unless $perlvar{'lonRole'} eq 'library';
   
   # ----------------------------- Make sure this process is running from user=www
   
   my $wwwid=getpwnam('www');
   if ($wwwid!=$<) {
       $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
       $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
       system("echo 'User ID mismatch. searchcat.pl must be run as user www.' |\
    mailto $emailto -s '$subj' > /dev/null");
       exit 1;
   }
   
   
   # ---------------------------------------------------------- We are in business
   
   open(LOG,'>'.$perlvar{'lonDaemons'}.'/logs/searchcat.log');
   print LOG '==== Searchcat Run '.localtime()."====\n\n";
   $simplestatus='time='.time.'&';
 my $dbh;  my $dbh;
 # ------------------------------------- Make sure that database can be accessed  # ------------------------------------- Make sure that database can be accessed
 {  {
     unless (      unless (
     $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0})      $dbh = DBI->connect("DBI:mysql:loncapa","www",$perlvar{'lonSqlAccess'},{ RaiseError =>0,PrintError=>0})
     ) {       ) { 
  print "Cannot connect to database!\n";   print LOG "Cannot connect to database!\n";
    $simplestatus.='mysql=defunct';
    &writesimple();
  exit;   exit;
     }      }
   
     my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (".      my $make_metadata_table = "CREATE TABLE IF NOT EXISTS metadata (".
         "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ".          "title TEXT, author TEXT, subject TEXT, url TEXT, keywords TEXT, ".
         "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ".          "version TEXT, notes TEXT, abstract TEXT, mime TEXT, language TEXT, ".
Line 195  my $dbh; Line 264  my $dbh;
   
 # ------------------------------------------------------------- get .meta files  # ------------------------------------------------------------- get .meta files
 opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");  opendir(RESOURCES,"$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}");
 my @homeusers=grep  my @homeusers = grep {
           {&ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")}      &ishome("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$_")
           grep {!/^\.\.?$/} readdir(RESOURCES);      } grep {!/^\.\.?$/} readdir(RESOURCES);
 closedir RESOURCES;  closedir RESOURCES;
   
   #
   # Create the statement handlers we need
   my $delete_sth = $dbh->prepare
       ("DELETE FROM metadata WHERE url LIKE BINARY ?");
   
   my $insert_sth = $dbh->prepare
       ("INSERT INTO metadata VALUES (".
        "?,".   # title
        "?,".   # author
        "?,".   # subject
        "?,".   # m2???
        "?,".   # version
        "?,".   # current
        "?,".   # notes
        "?,".   # abstract
        "?,".   # mime
        "?,".   # language
        "?,".   # creationdate
        "?,".   # revisiondate
        "?,".   # owner
        "?)"    # copyright
        );
   
 foreach my $user (@homeusers) {  foreach my $user (@homeusers) {
       print LOG "\n=== User: ".$user."\n\n";
       # Remove left-over db-files from potentially crashed searchcat run
       my $prodir=&propath($perlvar{'lonDefDomain'},$user);
       unlink($prodir.'/nohist_new_resevaldata.db');
       # Use find.pl
       undef @metalist;
       @metalist=();
     &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");      &find("$perlvar{'lonDocRoot'}/res/$perlvar{'lonDefDomain'}/$user");
 }      # -- process each file to get metadata and put into search catalog SQL
       # database.  Also, check to see if already there.
 # -- process each file to get metadata and put into search catalog SQL database      # I could just delete (without searching first), but this works for now.
 # Also, check to see if already there.      foreach my $m (@metalist) {
 # I could just delete (without searching first), but this works for now.          print LOG "- ".$m."\n";
 foreach my $m (@metalist) {          my $ref=&metadata($m);
     my $ref=&metadata($m);          my $m2='/res/'.&declutter($m);
     my $m2='/res/'.&declutter($m);          $m2=~s/\.meta$//;
     $m2=~s/\.meta$//;  #        &dynamicmeta($m2);
     &dynamicmeta($m2);   &count($m2);
     my $q2="select * from metadata where url like binary '$m2'";          $delete_sth->execute($m2);
     my $sth = $dbh->prepare($q2);          $insert_sth->execute($ref->{'title'},
     $sth->execute();                               $ref->{'author'},
     my $r1=$sth->fetchall_arrayref;                               $ref->{'subject'},
     if (@$r1) {                               $m2,
  $sth=$dbh->prepare("delete from metadata where url like binary '$m2'");                               $ref->{'keywords'},
         $sth->execute();                               'current',
                                $ref->{'notes'},
                                $ref->{'abstract'},
                                $ref->{'mime'},
                                $ref->{'language'},
                                sqltime($ref->{'creationdate'}),
                                sqltime($ref->{'lastrevisiondate'}),
                                $ref->{'owner'},
                                $ref->{'copyright'});
   #        if ($dbh->err()) {
   #            print STDERR "Error:".$dbh->errstr()."\n";
   #        }
           $ref = undef;
     }      }
     $sth=$dbh->prepare('insert into metadata values ('.      
   '"'.delete($ref->{'title'}).'"'.','.      # --------------------------------------------------- Clean up database
   '"'.delete($ref->{'author'}).'"'.','.      # Need to, perhaps, remove stale SQL database records.
   '"'.delete($ref->{'subject'}).'"'.','.      # ... not yet implemented
   '"'.$m2.'"'.','.          
   '"'.delete($ref->{'keywords'}).'"'.','.      # ------------------------------------------- Copy over the new db-files
   '"'.'current'.'"'.','.      #
   '"'.delete($ref->{'notes'}).'"'.','.      # Check the size of nohist_new_resevaldata.db compared to 
   '"'.delete($ref->{'abstract'}).'"'.','.      # nohist_resevaldata.db
   '"'.delete($ref->{'mime'}).'"'.','.  #    my @stat_result = stat($prodir.'/nohist_new_resevaldata.db');
   '"'.delete($ref->{'language'}).'"'.','.  #    my $new_size = $stat_result[7];
   '"'.sqltime(delete($ref->{'creationdate'})).'"'.','.  #    @stat_result = stat($prodir.'/nohist_resevaldata.db');
   '"'.sqltime(delete($ref->{'lastrevisiondate'})).'"'.','.  #    my $old_size = $stat_result[7];
   '"'.delete($ref->{'owner'}).'"'.','.  #    if ($old_size) {
   '"'.delete($ref->{'copyright'}).'"'.')');  # if ($new_size/$old_size > 0.15 ) {
     $sth->execute();  #    system('mv '.$prodir.'/nohist_new_resevaldata.db '.
 }  #   $prodir.'/nohist_resevaldata.db');
   # } else {
 # ----------------------------------------------------------- Clean up database  #    print LOG "Size of '$user' old nohist_reseval: $old_size ".
 # Need to, perhaps, remove stale SQL database records.  # "Size of new: $new_size.  Not overwriting.\n";
 # ... not yet implemented  #    my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
   #    my $subj="LON: $perlvar{'lonHostID'} searchcat.pl $user reseval ".
   # "modification error.";
   #    system("echo ".
   # "'See /home/httpd/perl/logs/searchcat.txt for information.' ".
   #   "| mailto $emailto -s '$subj' > /dev/null");
   # }
   #    }   
   }
 # --------------------------------------------------- Close database connection  # --------------------------------------------------- Close database connection
 $dbh->disconnect;  $dbh->disconnect;
   print LOG "\n==== Searchcat completed ".localtime()." ====\n";
   close(LOG);
   &writesimple();
   &writecount();
   exit 0;
   
   
   
   # =============================================================================
   
 # ---------------------------------------------------------------- Get metadata  # ---------------------------------------------------------------- Get metadata
 # significantly altered from subroutine present in lonnet  # significantly altered from subroutine present in lonnet
Line 259  sub metadata { Line 387  sub metadata {
         my $parser=HTML::TokeParser->new(\$metastring);          my $parser=HTML::TokeParser->new(\$metastring);
         my $token;          my $token;
         while ($token=$parser->get_token) {          while ($token=$parser->get_token) {
            if ($token->[0] eq 'S') {              if ($token->[0] eq 'S') {
       my $entry=$token->[1];                  my $entry=$token->[1];
               my $unikey=$entry;                  my $unikey=$entry;
               if (defined($token->[2]->{'part'})) {                   if (defined($token->[2]->{'part'})) { 
                  $unikey.='_'.$token->[2]->{'part'};                       $unikey.='_'.$token->[2]->{'part'}; 
       }                  }
               if (defined($token->[2]->{'name'})) {                   if (defined($token->[2]->{'name'})) { 
                  $unikey.='_'.$token->[2]->{'name'};                       $unikey.='_'.$token->[2]->{'name'}; 
       }                  }
               if ($metacache{$uri.'keys'}) {                  if ($metacache{$uri.'keys'}) {
                  $metacache{$uri.'keys'}.=','.$unikey;                      $metacache{$uri.'keys'}.=','.$unikey;
               } else {                  } else {
                  $metacache{$uri.'keys'}=$unikey;                      $metacache{$uri.'keys'}=$unikey;
       }                  }
               map {                  map {
   $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};                      $metacache{$uri.''.$unikey.'.'.$_}=$token->[2]->{$_};
               } @{$token->[3]};                  } @{$token->[3]};
               unless (                  unless (
                  $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry)                          $metacache{$uri.''.$unikey}=$parser->get_text('/'.$entry)
       ) { $metacache{$uri.''.$unikey}=                          ) { $metacache{$uri.''.$unikey}=
       $metacache{$uri.''.$unikey.'.default'};                                  $metacache{$uri.''.$unikey.'.default'};
       }                          }
           }              }
        }          }
     }      }
     return \%metacache;      return \%metacache;
 }  }
Line 290  sub metadata { Line 418  sub metadata {
 # ------------------------------------------------------------ Serves up a file  # ------------------------------------------------------------ Serves up a file
 # returns either the contents of the file or a -1  # returns either the contents of the file or a -1
 sub getfile {  sub getfile {
   my $file=shift;      my $file=shift;
   if (! -e $file ) { return -1; };      if (! -e $file ) { return -1; };
   my $fh=IO::File->new($file);      my $fh=IO::File->new($file);
   my $a='';      my $a='';
   while (<$fh>) { $a .=$_; }      while (<$fh>) { $a .=$_; }
   return $a      return $a;
 }  }
   
 # ------------------------------------------------------------- Declutters URLs  # ------------------------------------------------------------- Declutters URLs
Line 343  sub sqltime { Line 471  sub sqltime {
   
 sub maketime {  sub maketime {
     my %th=@_;      my %th=@_;
     return POSIX::mktime(      return POSIX::mktime(($th{'seconds'},$th{'minutes'},$th{'hours'},
         ($th{'seconds'},$th{'minutes'},$th{'hours'},                            $th{'day'},$th{'month'}-1,
          $th{'day'},$th{'month'}-1,$th{'year'}-1900,0,0,$th{'dlsav'}));                            $th{'year'}-1900,0,0,$th{'dlsav'}));
 }  }
   
   
Line 356  sub maketime { Line 484  sub maketime {
 sub unsqltime {  sub unsqltime {
     my $timestamp=shift;      my $timestamp=shift;
     if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) {      if ($timestamp=~/^(\d+)\-(\d+)\-(\d+)\s+(\d+)\:(\d+)\:(\d+)$/) {
        $timestamp=&maketime(          $timestamp=&maketime('year'=>$1,'month'=>$2,'day'=>$3,
    'year'=>$1,'month'=>$2,'day'=>$3,                               'hours'=>$4,'minutes'=>$5,'seconds'=>$6);
            'hours'=>$4,'minutes'=>$5,'seconds'=>$6);  
     }      }
     return $timestamp;      return $timestamp;
 }  }

Removed from v.1.24  
changed lines
  Added in v.1.39


FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>