From 876a2e5d82274a28bc62de14e79359fb366a89b3 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Tue, 3 Mar 2026 11:38:18 -0500 Subject: [PATCH 01/15] ETT-1288 Run fixity check on new storage - Add `storage_name` column to `feed_audit` and `feed_audit_detail` - `main_repo_audit.pl` now requires a `--storage_name` parameter - Expected values `s3-truenas-ictc` and `s3-truenas-macc` - `HTFeed/Storage/LocalPairtree.pm` and `HTFeed/StorageAudit.pm` also write `feed_audit.storage_name` and `feed_audit_detail.storage_name`, respectively, but I'm not sure what values that can or should provide - Add a couple of happy path tests for the script as a whole. --- bin/audit/main_repo_audit.pl | 154 ++++++++++++++++------ etc/ingest.sql | 6 +- lib/HTFeed/Storage/LocalPairtree.pm | 9 +- lib/HTFeed/StorageAudit.pm | 6 +- t/main_repo_audit.t | 83 +++++++++++++++ 5 files changed, 183 insertions(+), 75 deletions(-) create mode 100644 t/main_repo_audit.t diff --git a/bin/audit/main_repo_audit.pl b/bin/audit/main_repo_audit.pl index b795a51a..5f308877 100755 --- a/bin/audit/main_repo_audit.pl +++ b/bin/audit/main_repo_audit.pl @@ -21,22 +21,22 @@ use Getopt::Long; use URI::Escape; -my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ?"; +my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ? and storage_name = ?"; my $insert = -"insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked) values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP) \ +"insert into feed_audit (namespace, id, storage_name, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked) values(?,?,?,?,?,?,?,?,CURRENT_TIMESTAMP) \ ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP"; my $update = -"update feed_audit set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? 
and id = ?"; +"update feed_audit set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ? and storage_name = ?"; my $update_mets = -"update feed_audit set page_count = ?, image_size = ? where namespace = ? and id = ?"; +"update feed_audit set page_count = ?, image_size = ? where namespace = ? and id = ? and storage_name = ?"; my $insert_detail = -"insert into feed_audit_detail (namespace, id, path, status, detail) values (?,?,?,?,?)"; +"insert into feed_audit_detail (namespace, id, storage_name, path, status, detail) values (?,?,?,?,?,?)"; my $checkpoint_sel = -"select lastmd5check > ? from feed_audit where namespace = ? and id = ?"; +"select lastmd5check > ? from feed_audit where namespace = ? and id = ? and storage_name = ?"; ### set /sdr1 to /sdrX for test & parallelization my $filesProcessed = 0; @@ -44,15 +44,28 @@ my $do_md5 = 0; my $do_mets = 0; my $checkpoint = undef; +my $storage_name = undef; GetOptions( 'md5!' => \$do_md5, -'mets!' => \$do_mets, - 'checkpoint=s' => \$checkpoint, + 'mets!' => \$do_mets, + 'checkpoint=s' => \$checkpoint, + 'storage_name=s' => \$storage_name, ); +# The --sdr_partition flag is purely for testing, or whenever the partition cannot +# be inferred from the path argument. Pass a bare integer. 
+ +# $storage_name must be one of 's3-truenas-ictc', 's3-truenas-macc' +if (!defined $storage_name) { + die '--storage_name is required'; +} +if ($storage_name ne 's3-truenas-macc' && $storage_name ne 's3-truenas-ictc') { + die "--storage_name must have value of 's3-truenas-macc' or 's3-truenas-ictc"; +} + my $base = shift @ARGV or die("Missing base directory.."); -my ($sdr_partition) = ($base =~ qr#/?sdr(\d+)/?#); +my ($sdr_partition) = ($base =~ qr#sdr(\d+)/?#); open( RUN, "find $base -follow -type f|" ) or die("Can't open pipe to find: $!"); @@ -75,36 +88,38 @@ # print "$filesProcessed files processed\n"; # } - - # strip trailing / from path - my ( $pt_objid, $path, $type ) = - fileparse( $line, qr/\.mets\.xml/, qr/\.zip/ ); + my ($pt_objid, $path, $type) = fileparse( $line, qr/\.mets\.xml/, qr/\.zip/ ); $path =~ s/\/$//; # remove trailing / + return if ( $prevpath and $path eq $prevpath ); + # check mtime on directory - do not check if mtime is in the past two days # to let synciq catch up - return if recently_modified_path($path); + # Removed for ETT-1288 + #return if recently_modified_path($path); $prevpath = $path; - my @pathcomp = split( "/", $path ); + # For testing, remove everything up to and including the `sdrX/` + my $subpath = $path; + $subpath =~ s!.*?sdr\d+/!!; + my @pathcomp = split( "/", $subpath ); # remove base & any empty components @pathcomp = grep { $_ ne '' } @pathcomp; - my $first_path = shift @pathcomp; my $last_path = pop @pathcomp; my $namespace = $pathcomp[1]; my $objid = ppath2id( join( "/", @pathcomp ) ); if ( $pt_objid ne s2ppchars($objid) ) { - set_status( $namespace, $objid, $path, "BAD_PAIRTREE", + set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE", "$objid $pt_objid" ); } if ( $last_path ne $pt_objid ) { - set_status( $namespace, $objid, $path, "BAD_PAIRTREE", + set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE", "$last_path $pt_objid" ); } @@ -137,14 +152,14 @@ $last_touched = $mets_seconds if 
defined $mets_seconds and (not defined $zip_seconds or $mets_seconds > $zip_seconds); #test symlinks unless we're traversing sdr1 or the file is too new - if ( $first_path ne 'sdr1' and (defined $last_touched and time - $last_touched >= 86400) ) { + if ( $sdr_partition != 1 and (defined $last_touched and time - $last_touched >= 86400) ) { my $link_path = join( "/", "/sdr1", @pathcomp, $last_path ); my $link_target = readlink $link_path - or set_status( $namespace, $objid, $path, "CANT_LSTAT", + or set_status( $namespace, $objid, $storage_name, $path, "CANT_LSTAT", "$link_path $!" ); if ( defined $link_target and $link_target ne $path ) { - set_status( $namespace, $objid, $path, "SYMLINK_INVALID", + set_status( $namespace, $objid, $storage_name, $path, "SYMLINK_INVALID", $link_target ); } @@ -154,7 +169,7 @@ execute_stmt( $insert, - $namespace, $objid, + $namespace, $objid, $storage_name, $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate, @@ -183,7 +198,7 @@ $found_zip++ if $ext eq 'zip'; $found_mets++ if $ext eq 'mets.xml'; if ( $pt_objid ne $dir_barcode ) { - set_status( $namespace, $objid, $path, "BARCODE_MISMATCH", + set_status( $namespace, $objid, $storage_name, $path, "BARCODE_MISMATCH", "$pt_objid $dir_barcode" ); } $filecount++; @@ -191,27 +206,28 @@ closedir($dh); -# check file count; do md5 check and METS extraction stuff, but only if it's fully replicated - if ( ( defined $zip_seconds and time - $zip_seconds > 86400 ) - or ( defined $mets_seconds and time - $mets_seconds > 86400 ) ) + # Removed for ETT-1288 + # check file count; do md5 check and METS extraction stuff, but only if it's fully replicated + #if ( ( defined $zip_seconds and time - $zip_seconds > 86400 ) + # or ( defined $mets_seconds and time - $mets_seconds > 86400 ) ) { - if ( $filecount > 2 or $filecount < 1 or ($found_zip != 1 and not is_tombstoned($namespace,$objid) ) or $found_mets != 1 ) { - set_status( $namespace, $objid, $path, "BAD_FILECOUNT", + if ( $filecount > 2 or 
$filecount < 1 or ($found_zip != 1 and not is_tombstoned($namespace,$objid,$storage_name) ) or $found_mets != 1 ) { + set_status( $namespace, $objid, $storage_name, $path, "BAD_FILECOUNT", "zip=$found_zip mets=$found_mets total=$filecount" ); } eval { - my $rval = zipcheck( $namespace, $objid ); + my $rval = zipcheck( $namespace, $objid, $storage_name ); if ($rval) { - execute_stmt( $update, "1", $namespace, $objid ); + execute_stmt( $update, "1", $namespace, $objid, $storage_name ); } elsif ( defined $rval ) { - execute_stmt( $update, "0", $namespace, $objid ); + execute_stmt( $update, "0", $namespace, $objid, $storage_name ); } }; if ($@) { - set_status( $namespace, $objid, $path, "CANT_ZIPCHECK", $@ ); + set_status( $namespace, $objid, $storage_name, $path, "CANT_ZIPCHECK", $@ ); } } @@ -223,15 +239,15 @@ } sub zipcheck { - my ( $namespace, $objid ) = @_; + my ( $namespace, $objid, $storage_name ) = @_; return unless $do_md5 or $do_mets; - return if is_tombstoned($namespace, $objid); + return if is_tombstoned($namespace, $objid, $storage_name); # don't check this item if we just looked at it if(defined $checkpoint) { - my $sth = execute_stmt($checkpoint_sel,$checkpoint,$namespace,$objid); + my $sth = execute_stmt($checkpoint_sel,$checkpoint,$namespace,$objid,$storage_name); if(my @row = $sth->fetchrow_array()) { return if @row and $row[0]; } @@ -266,7 +282,8 @@ sub zipcheck { } if ( not defined $mets_zipsum or length($mets_zipsum) ne 32 ) { - set_status( $namespace, $objid, $volume->get_repository_mets_path(), + set_status( $namespace, $objid, $storage_name, + $volume->get_repository_mets_path(), "MISSING_METS_CHECKSUM", undef ); } else { @@ -277,7 +294,7 @@ sub zipcheck { $rval = 1; } else { - set_status( $namespace, $objid, + set_status( $namespace, $objid, $storage_name, $volume->get_repository_zip_path(), "BAD_CHECKSUM", "expected=$mets_zipsum actual=$realsum" ); $rval = 0; @@ -297,7 +314,7 @@ sub zipcheck { $filetypes{$extension}++; } while ( my ( $ext, 
$count ) = each(%filetypes) ) { - mets_log( $namespace, $objid, "FILETYPE", $ext, $count ); + mets_log( $namespace, $objid, $storage_name, "FILETYPE", $ext, $count ); } } @@ -310,7 +327,7 @@ sub zipcheck { $premisversion = "premis2"; } - mets_log( $namespace, $objid, "PREMIS_VERSION", $premisversion ); + mets_log( $namespace, $objid, $storage_name, "PREMIS_VERSION", $premisversion ); } { # PREMIS event ID types @@ -325,7 +342,7 @@ sub zipcheck { $event_id_types{ $mets->findvalue( '.', $eventtype ) }++; } foreach my $event_id_type ( keys(%event_id_types) ) { - mets_log( $namespace, $objid, "PREMIS_EVENT_TYPE", + mets_log( $namespace, $objid, $storage_name, "PREMIS_EVENT_TYPE", $event_id_type, $event_id_types{$event_id_type} ); } } @@ -341,7 +358,7 @@ sub zipcheck { $agent_id_types{ $mets->findvalue( '.', $agenttype ) }++; } foreach my $agent_id_type ( keys(%agent_id_types) ) { - mets_log( $namespace, $objid, "PREMIS_AGENT_TYPE", + mets_log( $namespace, $objid, $storage_name, "PREMIS_AGENT_TYPE", $agent_id_type, $agent_id_types{$agent_id_type} ); } @@ -361,7 +378,7 @@ sub zipcheck { my $date = $mets->findvalue( './premis:eventDateTime', $event ); - mets_log( $namespace, $objid, "CAPTURE", $executor, $date ); + mets_log( $namespace, $objid, $storage_name, "CAPTURE", $executor, $date ); } } { # Processing agent @@ -378,7 +395,7 @@ sub zipcheck { my $date = $mets->findvalue( './premis:eventDateTime', $event ); - mets_log( $namespace, $objid, "MD5SUM", $executor, $date ); + mets_log( $namespace, $objid, $storage_name, "MD5SUM", $executor, $date ); } } @@ -392,14 +409,14 @@ sub zipcheck { my $date = $mets->findvalue( './premis:eventDateTime', $event ); - mets_log( $namespace, $objid, "INGEST", $date ); + mets_log( $namespace, $objid, $storage_name, "INGEST", $date ); } } { # MARC present my $marc_present = $mets->findvalue('count(//marc:record | //record)'); - mets_log( $namespace, $objid, "MARC", $marc_present ); + mets_log( $namespace, $objid, $storage_name, "MARC", 
$marc_present ); } { # METS valid @@ -410,7 +427,7 @@ sub zipcheck { $error =~ s/\n/ /mg; } - mets_log( $namespace, $objid, "METS_VALID", $mets_valid, $error ); + mets_log( $namespace, $objid, $storage_name, "METS_VALID", $mets_valid, $error ); } { @@ -427,7 +444,7 @@ sub zipcheck { push( @mdbits, "$attr=$attrval" ); } } - mets_log( $namespace, $objid, "METS_MDSEC", + mets_log( $namespace, $objid, $storage_name, "METS_MDSEC", join( "; ", @mdbits ) ); } } @@ -436,15 +453,15 @@ sub zipcheck { { # Page tagging, image size my $has_pagetags = $mets->findvalue( 'count(//mets:div[@TYPE="page"]/@LABEL[string() != ""])'); - mets_log( $namespace, $objid, "PAGETAGS", $has_pagetags ); + mets_log( $namespace, $objid, $storage_name, "PAGETAGS", $has_pagetags ); my $pages = $mets->findvalue('count(//mets:div[@TYPE="page"])'); - mets_log( $namespace, $objid, "PAGES", $pages ); + mets_log( $namespace, $objid, $storage_name, "PAGES", $pages ); my $image_size = $mets->findvalue('sum(//mets:fileGrp[@USE="image"]/mets:file/@SIZE)'); - mets_log( $namespace, $objid, "IMAGE_SIZE", $image_size); + mets_log( $namespace, $objid, $storage_name, "IMAGE_SIZE", $image_size); - execute_stmt($update_mets,$pages,$image_size,$namespace,$objid); + execute_stmt($update_mets,$pages,$image_size,$namespace,$objid,$storage_name); } @@ -486,16 +503,16 @@ sub extract_source_mets { } } if ( !@srcmets ) { - set_status( $namespace, $objid, $zipfile, "NO_SOURCE_METS", undef ); + set_status( $namespace, $objid, $storage_name, $zipfile, "NO_SOURCE_METS", undef ); } elsif ( @srcmets != 1 ) { - set_status( $namespace, $objid, $zipfile, + set_status( $namespace, $objid, $storage_name, $zipfile, "MULTIPLE_SOURCE_METS_CANDIDATES", undef ); } else { # source METS found - mets_log( $namespace, $objid, "SOURCE_METS", $srcmets[0] ); + mets_log( $namespace, $objid, $storage_name, "SOURCE_METS", $srcmets[0] ); system("cd /tmp; unzip -j '$zipfile' '$srcmets[0]'"); my ($smets_name) = ( $srcmets[0] =~ /\/([^\/]+)$/ ); my 
$tmp_smets_loc = "/tmp/$smets_name"; @@ -515,13 +532,13 @@ sub extract_source_mets { $mdsecs{ join( '; ', @mdbits ) } = 1; } foreach my $mdsec ( sort( keys(%mdsecs) ) ) { - mets_log( $namespace, $objid, "SRC_METS_MDSEC", $mdsec ); + mets_log( $namespace, $objid, $storage_name, "SRC_METS_MDSEC", $mdsec ); } # Try to get Google reading order foreach my $tag (qw(gbs:pageOrder gbs:pageSequence gbs:coverTag)) { my $val = $xpc->findvalue("//$tag"); - mets_log( $namespace, $objid, "GBS_READING", $tag, $val ); + mets_log( $namespace, $objid, $storage_name, "GBS_READING", $tag, $val ); } foreach my $techmd ( $xpc->findnodes("//mets:techMD") ) { @@ -532,14 +549,14 @@ sub extract_source_mets { my $count = $xpc->findvalue( "count(//mets:file[contains(\@ADMID,\"$imagemethod_id\")])" ); - mets_log( $namespace, $objid, "IMAGE_METHOD", $method, + mets_log( $namespace, $objid, $storage_name, "IMAGE_METHOD", $method, $count ); } } }; if ($@) { - set_status( $namespace, $objid, $srcmets[0], "BAD_SOURCE_METS", + set_status( $namespace, $objid, $storage_name, $srcmets[0], "BAD_SOURCE_METS", $@ ); } @@ -556,7 +573,7 @@ sub mets_log { my $val2 = shift; $val1 = '' if not defined $val1; $val2 = '' if not defined $val2; - print join( "\t", $namespace, $objid, $key, $val1, $val2 ), "\n"; + print join( "\t", $namespace, $objid, $storage_name, $key, $val1, $val2 ), "\n"; #execute_stmt($fs_mets_data,$namespace,$objid,$key,$val1,$val2); } @@ -564,7 +581,7 @@ sub mets_log { sub is_tombstoned { my $namespace = shift; my $objid = shift; - my $sth = execute_stmt($tombstone_check,$namespace,$objid); + my $sth = execute_stmt($tombstone_check,$namespace,$objid,$storage_name); if(my @row = $sth->fetchrow_array()) { return $row[0]; } else { @@ -572,14 +589,15 @@ sub is_tombstoned { } } -sub recently_modified_path { - my $path = shift; - - my $mtime = ( stat($path) )[9]; - my $mtime_age = time() - $mtime; - - return 1 if $mtime_age < (86400 * 2); -} +# Removed for ETT-1288 +# sub recently_modified_path { 
+# my $path = shift; +# +# my $mtime = ( stat($path) )[9]; +# my $mtime_age = time() - $mtime; +# +# return 1 if $mtime_age < (86400 * 2); +# } sub recent_previous_version { my $file = shift; diff --git a/etc/ingest.sql b/etc/ingest.sql index 17e95127..6551e41e 100644 --- a/etc/ingest.sql +++ b/etc/ingest.sql @@ -3,6 +3,7 @@ USE `ht`; CREATE TABLE IF NOT EXISTS `feed_audit` ( `namespace` varchar(10) NOT NULL, `id` varchar(30) NOT NULL, + `storage_name` varchar(32) NOT NULL, `sdr_partition` tinyint(4) DEFAULT NULL, `zip_size` bigint(20) DEFAULT NULL, `image_size` bigint(20) DEFAULT NULL, @@ -14,7 +15,7 @@ CREATE TABLE IF NOT EXISTS `feed_audit` ( `lastmd5check` timestamp NULL DEFAULT NULL, `md5check_ok` tinyint(1) DEFAULT NULL, `is_tombstoned` tinyint(1) DEFAULT NULL, - PRIMARY KEY (`namespace`,`id`), + PRIMARY KEY (`namespace`,`id`,`storage_name`), KEY `feed_audit_zip_date_idx` (`zip_date`) ); @@ -144,11 +145,12 @@ CREATE TABLE IF NOT EXISTS `feed_storage` ( CREATE TABLE IF NOT EXISTS `feed_audit_detail` ( `namespace` varchar(10) NOT NULL, `id` varchar(30) NOT NULL, + `storage_name` varchar(32) NOT NULL, `path` varchar(255) DEFAULT NULL, `status` varchar(30) DEFAULT NULL, `detail` tinytext, `time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, - KEY `fs_log_status_objid_idx` (`namespace`,`id`) + KEY `fs_log_status_objid_idx` (`namespace`,`id`,`storage_name`) ); USE `ht`; diff --git a/lib/HTFeed/Storage/LocalPairtree.pm b/lib/HTFeed/Storage/LocalPairtree.pm index 7308fcda..3e55646b 100644 --- a/lib/HTFeed/Storage/LocalPairtree.pm +++ b/lib/HTFeed/Storage/LocalPairtree.pm @@ -138,12 +138,16 @@ sub record_audit { my ($sdr_partition) = ($path =~ qr#/?sdr(\d+)/?#); my $stmt = - "insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked, lastmd5check, md5check_ok) \ - values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \ + "insert into feed_audit (namespace, id, storage_name, sdr_partition, zip_size, zip_date, 
mets_size, mets_date, lastchecked, lastmd5check, md5check_ok) \ + values(?,?,?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \ ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP,lastmd5check = CURRENT_TIMESTAMP, md5check_ok = 1"; # TODO populate image_size, page_count + # FIXME: is this right?? should we force it to one of {s3-truenas-ictc, s3-truenas-macc}? + my $storage_name = $self->{storage_name} || 'LocalPairtree'; + + my $zipsize = $self->zip_size; my $zipdate = $self->file_date($self->zip_obj_path); my $metssize = $self->mets_size; @@ -151,6 +155,7 @@ sub record_audit { my $sth = get_dbh()->prepare($stmt); my $res = $sth->execute( $self->{namespace}, $self->{objid}, + $storage_name, $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate, # duplicate parameters for duplicate key update $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate diff --git a/lib/HTFeed/StorageAudit.pm b/lib/HTFeed/StorageAudit.pm index 1bb9f48f..a5a8db21 100644 --- a/lib/HTFeed/StorageAudit.pm +++ b/lib/HTFeed/StorageAudit.pm @@ -397,9 +397,9 @@ sub record_error { my $status = shift @$err; my %details = @$err; my $detail = join "\t", map { "$_: $details{$_}"; } keys %details; - my $sql = 'INSERT INTO feed_audit_detail (namespace, id, path, status, detail)'. - ' VALUES (?,?,?,?,?)'; - execute_stmt($sql, $obj->{namespace}, $obj->{objid}, $obj->{path}, $status, $detail); + my $sql = 'INSERT INTO feed_audit_detail (namespace, id, storage_name, path, status, detail)'. 
+ ' VALUES (?,?,?,?,?,?)'; + execute_stmt($sql, $obj->{namespace}, $obj->{objid}, $self->{storage_name}, $obj->{path}, $status, $detail); } # ==== UTILITY CLASS METHOD ==== diff --git a/t/main_repo_audit.t b/t/main_repo_audit.t new file mode 100644 index 00000000..5dfbe023 --- /dev/null +++ b/t/main_repo_audit.t @@ -0,0 +1,83 @@ +use Test::Spec; +use HTFeed::DBTools qw(get_dbh); +use HTFeed::Storage::LocalPairtree; +use Capture::Tiny; +use File::Copy; +use File::Pairtree qw(id2ppath s2ppchars); +use File::Path; + +use strict; + +describe "bin/audit/main_repo_audit.pl" => sub { + spec_helper 'storage_helper.pl'; + local our ($tmpdirs, $testlog); + + sub local_storage { + my $volume = stage_volume($tmpdirs,@_); + + my $storage = HTFeed::Storage::LocalPairtree->new( + name => 'localpairtree-test', + volume => $volume, + config => { + obj_dir => $tmpdirs->{obj_dir} + } + ); + return $storage; + } + + sub count_feed_audit_entries { + my $namespace = shift; + my $objid = shift; + my $storage_name = shift; + my $sdr_partition = shift; + + my $sql = 'SELECT COUNT(*) FROM feed_audit WHERE namespace=? AND id=? AND storage_name=? AND sdr_partition=?'; + my $sth = get_dbh()->prepare($sql); + $sth->execute($namespace, $objid, $storage_name, $sdr_partition); + if (my @row = $sth->fetchrow_array()) { + return $row[0]; + } else { + return 0; + } + } + + before each => sub { + my $namespace = 'test'; + my $objid = 'test'; + my $storage = local_storage($namespace, $objid); + $storage->stage; + $storage->make_object_path; + $storage->move; + my $pt_objid = s2ppchars($objid); + my $pt_path = id2ppath($objid); + # main_repo_audit.pl can infer its sdr partition when it isn't at the root of the + # filesystem but the `tmpdirs` random names will throw it off completely. Hence we + # copy to a location where we can put "sdr1" in the path. 
+ File::Path::make_path('/tmp/sdr1/obj'); + `cp -r $tmpdirs->{obj_dir}/* /tmp/sdr1/obj/`; + # This is just conforming to `etc/config_test.yml` so Volume.pm can find the files. + File::Path::make_path("/tmp/obj_link/test/$pt_path"); + `ln -s /tmp/sdr1/obj/test/$pt_path/$pt_objid /tmp/obj_link/test/$pt_path`; + }; + + after each => sub { + File::Path::remove_tree('/tmp/sdr1'); + File::Path::remove_tree('/tmp/obj_link'); + }; + + describe 'at macc' => sub { + it "succeeds" => sub { + `bin/audit/main_repo_audit.pl --md5 --storage_name s3-truenas-macc /tmp/sdr1`; + is(count_feed_audit_entries('test', 'test', 's3-truenas-macc', 1), 1, 'one feed_audit entry'); + }; + }; + + describe 'at ictc' => sub { + it "succeeds" => sub { + `bin/audit/main_repo_audit.pl --md5 --storage_name s3-truenas-ictc /tmp/sdr1`; + is(count_feed_audit_entries('test', 'test', 's3-truenas-ictc', 1), 1, 'one feed_audit entry'); + }; + }; +}; + +runtests unless caller; From 412d376b6547d3c5a36b94305ebf48d79c2c513c Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Tue, 3 Mar 2026 12:10:08 -0500 Subject: [PATCH 02/15] Remove stale comment --- bin/audit/main_repo_audit.pl | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/audit/main_repo_audit.pl b/bin/audit/main_repo_audit.pl index 5f308877..3d017f14 100755 --- a/bin/audit/main_repo_audit.pl +++ b/bin/audit/main_repo_audit.pl @@ -52,8 +52,6 @@ 'storage_name=s' => \$storage_name, ); -# The --sdr_partition flag is purely for testing, or whenever the partition cannot -# be inferred from the path argument. Pass a bare integer. 
# $storage_name must be one of 's3-truenas-ictc', 's3-truenas-macc' if (!defined $storage_name) { die '--storage_name is required'; } From d7770dc376b991c3ed1bf1990916548fa9d09e51 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Tue, 3 Mar 2026 12:24:21 -0500 Subject: [PATCH 03/15] Restore unnecessarily changed regex for sdr_partition --- bin/audit/main_repo_audit.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/audit/main_repo_audit.pl b/bin/audit/main_repo_audit.pl index 3d017f14..05b235f5 100755 --- a/bin/audit/main_repo_audit.pl +++ b/bin/audit/main_repo_audit.pl @@ -63,7 +63,7 @@ my $base = shift @ARGV or die("Missing base directory.."); -my ($sdr_partition) = ($base =~ qr#sdr(\d+)/?#); +my ($sdr_partition) = ($base =~ qr#/?sdr(\d+)/?#); open( RUN, "find $base -follow -type f|" ) or die("Can't open pipe to find: $!"); From 318cbc68b28db9c2329a958a820679c5fc6b2ddb Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Tue, 3 Mar 2026 12:28:02 -0500 Subject: [PATCH 04/15] Remove unused includes in t/main_repo_audit.t --- t/main_repo_audit.t | 2 -- 1 file changed, 2 deletions(-) diff --git a/t/main_repo_audit.t b/t/main_repo_audit.t index 5dfbe023..418e72ed 100644 --- a/t/main_repo_audit.t +++ b/t/main_repo_audit.t @@ -1,10 +1,8 @@ use Test::Spec; use HTFeed::DBTools qw(get_dbh); use HTFeed::Storage::LocalPairtree; -use Capture::Tiny; use File::Copy; use File::Pairtree qw(id2ppath s2ppchars); -use File::Path; use strict; From cf920f675c2502fb7b08fd6738a324403a9e8171 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Fri, 6 Mar 2026 12:01:24 -0500 Subject: [PATCH 05/15] Add RepositoryIterator class --- lib/HTFeed/RepositoryIterator.pm | 137 +++++++++++++++++++++++++++++++ t/repository_iterator.t | 90 ++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 lib/HTFeed/RepositoryIterator.pm create mode 100644 t/repository_iterator.t diff --git a/lib/HTFeed/RepositoryIterator.pm b/lib/HTFeed/RepositoryIterator.pm new file mode 100644 
index 00000000..6d4e6855 --- /dev/null +++ b/lib/HTFeed/RepositoryIterator.pm @@ -0,0 +1,137 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +#use FindBin; +#use lib "$FindBin::Bin/../../lib"; + +use File::Basename; +use File::Pairtree qw(ppath2id s2ppchars); + +package HTFeed::RepositoryIterator; + +# The only restriction on `path` is that it must have a component ending with `sdrX` +# where X is one or more digits +sub new { + my $class = shift; + my $path = shift; + + # Remove trailing slash from path if necessary + $path =~ s!/$!!; + my @pathcomp = split('/', $path); + # remove base & any empty components + #@pathcomp = grep { $_ ne '' } @pathcomp; + my $sdr_partition = undef; + if ($path =~ qr#/?sdr(\d+)/?#) { + $sdr_partition = $1; + } else { + die "Cannot infer SDR partition from $path"; + } + my $self = { + # The path to traverse. May be a subpath like /tmp/sdr1/obj/test + path => $path, + sdr_partition => $sdr_partition, + objects_processed => 0, + }; + bless($self, $class); + return $self; +} + +sub next_object { + my $self = shift; + + my $obj = undef; + while (1) { + my $line = readline($self->_find_pipe); + last unless defined $line; + chomp $line; + # ignore temporary location + next if $line =~ qr(obj/\.tmp); + next if $line =~ /\Qpre_uplift.mets.xml\E/; + #next if $self->_recent_previous_version($line); + + my ($file_objid, $path, $type) = File::Basename::fileparse($line, qr/\.mets\.xml/, qr/\.zip/); + # Remove trailing slash + $path =~ s!/$!!; + next if $self->{prev_path} and $path eq $self->{prev_path}; + + $self->{objects_processed}++; + $self->{prev_path} = $path; + + # Remove everything up to and including the `sdrX/` + my $subpath = $path; + $subpath =~ s!.*?sdr\d+/!!; + my @pathcomp = split('/', $subpath); + @pathcomp = grep { $_ ne '' } @pathcomp; + my $namespace = $pathcomp[1]; + my $directory_objid = $pathcomp[-1]; + my $objid = File::Pairtree::ppath2id(join('/', @pathcomp)); + $obj = { + path => $path, + namespace => $namespace, + # 
Caller should make sure all three of these are equivalent + objid => $objid, + file_objid => $file_objid, + directory_objid => $directory_objid, + # This is simple concatenation. Might be more interesting to return the actual contents of the directory. + #zipfile => "$path/$file_objid.zip", + #metsfile => "$path/$file_objid.mets.xml", + contents => $self->_contents($path), + }; + last; + } + return $obj; +} + +sub close { + my $self = shift; + + if ($self->{find_pipe}) { + close $self->{find_pipe}; + $self->{find_pipe} = undef; + } +} + +# Returns a sorted arrayref with filenames (not full paths) in +# an object directory. Excludes . and .. +sub _contents { + my $self = shift; + my $path = shift; + + my @contents; + opendir(my $dh, $path); + while ( my $file = readdir($dh) ) { + next if $file eq '.' or $file eq '..'; + push(@contents, $file); + } + @contents = sort @contents; + return \@contents; +} + +sub _find_pipe { + my $self = shift; + + if (!$self->{find_pipe}) { + my $find_pipe; + my $find_cmd = "find $self->{path} -follow -type f|"; + open($find_pipe, $find_cmd) or die("Can't open pipe to find: $!"); + $self->{find_pipe} = $find_pipe; + } + return $self->{find_pipe}; +} + +# NOTE: is this needed? +# Does file end with `.old` suffix and is it less than 48 hours old? 
+sub _recent_previous_version { + my $self = shift; + my $file = shift; + + if ($file =~ /.old$/) { + my $ctime = ( stat($file) )[10]; + my $ctime_age = time() - $ctime; + return 1 if $ctime_age < (86400 * 2); + } +} + +1; diff --git a/t/repository_iterator.t b/t/repository_iterator.t new file mode 100644 index 00000000..60ed247a --- /dev/null +++ b/t/repository_iterator.t @@ -0,0 +1,90 @@ +use strict; +use warnings; + +use File::Copy; +use File::Pairtree qw(id2ppath s2ppchars); +use File::Path; + +use Test::Spec; +use HTFeed::RepositoryIterator; + +describe "HTFeed::RepositoryIterator" => sub { + spec_helper 'storage_helper.pl'; + local our ($tmpdirs, $testlog); + + sub make_sdr_entry { + my $namespace = shift; + my $objid = shift; + + my $pt_objid = s2ppchars($objid); + my $pt_path = id2ppath($objid); + my $full_path = "/tmp/sdr1/obj/$namespace/$pt_path" . $pt_objid; + File::Path::make_path($full_path); + `touch $full_path/$pt_objid.mets.xml`; + `touch $full_path/$pt_objid.zip`; + } + + before all => sub { + my $namespace = 'test'; + my $objid = 'test'; + make_sdr_entry('ns1', 'objid1'); + make_sdr_entry('ns2', 'objid2'); + }; + + after all => sub { + File::Path::remove_tree('/tmp/sdr1'); + }; + + describe 'new' => sub { + it "creates an object that exposes the expected data" => sub { + my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1'); + is($iterator->{path}, '/tmp/sdr1', 'it has the path we gave it'); + is($iterator->{sdr_partition}, 1, 'it has sdr partition of 1 from sdr1'); + }; + }; + + describe 'next_object' => sub { + it "returns an object with the expected data" => sub { + my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1'); + my @objects; + my $object = $iterator->next_object; + is($object->{path}, '/tmp/sdr1/obj/ns1/pairtree_root/ob/ji/d1/objid1', 'path to the terminal directory'); + is($object->{namespace}, 'ns1', 'namespace `test` from path'); + is($object->{objid}, 'objid1', 'objid `objid1` from pairtree'); + 
is($object->{file_objid}, 'objid1', 'file_objid `objid1` from filename'); + is($object->{directory_objid}, 'objid1', 'directory_objid `objid1` from terminal directory name'); + is_deeply($object->{contents}, ['objid1.mets.xml','objid1.zip'], '.mets.xml and .zip contents'); + is($iterator->{objects_processed}, 1, 'it has processed 1 object'); + + }; + + it "returns two objects" => sub { + my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1'); + while ($iterator->next_object) { } + is($iterator->{objects_processed}, 2, 'it has processed 2 objects'); + }; + + describe 'with a subdirectory' => sub { + it "returns an object with the expected data" => sub { + my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1/obj/ns1/'); + my @objects; + my $object = $iterator->next_object; + is($object->{path}, '/tmp/sdr1/obj/ns1/pairtree_root/ob/ji/d1/objid1', 'path to the terminal directory'); + is($object->{namespace}, 'ns1', 'namespace `ns1` from path'); + is($object->{objid}, 'objid1', 'objid `objid1` from pairtree'); + is($object->{file_objid}, 'objid1', 'file_objid `objid1` from filename'); + is($object->{directory_objid}, 'objid1', 'directory_objid `objid1` from terminal directory name'); + is_deeply($object->{contents}, ['objid1.mets.xml','objid1.zip'], '.mets.xml and .zip contents'); + is($iterator->{objects_processed}, 1, 'it has processed 1 file'); + }; + + it "returns only one object" => sub { + my $iterator = HTFeed::RepositoryIterator->new('/tmp/sdr1/obj/ns1/'); + while ($iterator->next_object) { } + is($iterator->{objects_processed}, 1, 'it has processed 1 object'); + }; + }; + }; +}; + +runtests unless caller; From 3aeeb600b3b2325128808d5520ae39eb24fb5b08 Mon Sep 17 00:00:00 2001 From: Moses Hall Date: Fri, 6 Mar 2026 13:19:40 -0500 Subject: [PATCH 06/15] Ignore pairtree_prefix file --- lib/HTFeed/RepositoryIterator.pm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/HTFeed/RepositoryIterator.pm b/lib/HTFeed/RepositoryIterator.pm index 
6d4e6855..0328ceeb 100644 --- a/lib/HTFeed/RepositoryIterator.pm +++ b/lib/HTFeed/RepositoryIterator.pm @@ -46,6 +46,8 @@ sub next_object { my $line = readline($self->_find_pipe); last unless defined $line; chomp $line; + # Pairtree stuff + next if $line =~ /pairtree_prefix$/; # ignore temporary location next if $line =~ qr(obj/\.tmp); next if $line =~ /\Qpre_uplift.mets.xml\E/; From 7c0d2a3e07d9d660e5ee1d1b6d4fa5a2a00b76b5 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Wed, 11 Mar 2026 12:36:29 -0400 Subject: [PATCH 07/15] - Revert main_repo_audit.pl - New code is truenas_audit.pl - TODO: several FIXMEs in the code, possible leftovers from main_repo_audit.pl inside truenas_audit.pl --- bin/audit/main_repo_audit.pl | 150 +++++++-------- bin/audit/truenas_audit.pl | 301 +++++++++++++++++++++++++++++++ lib/HTFeed/RepositoryIterator.pm | 13 +- t/main_repo_audit.t | 81 --------- t/repository_iterator.t | 2 - t/truenas_audit.t | 175 ++++++++++++++++++ 6 files changed, 548 insertions(+), 174 deletions(-) create mode 100755 bin/audit/truenas_audit.pl delete mode 100644 t/main_repo_audit.t create mode 100644 t/truenas_audit.t diff --git a/bin/audit/main_repo_audit.pl b/bin/audit/main_repo_audit.pl index 05b235f5..b795a51a 100755 --- a/bin/audit/main_repo_audit.pl +++ b/bin/audit/main_repo_audit.pl @@ -21,22 +21,22 @@ use Getopt::Long; use URI::Escape; -my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ? and storage_name = ?"; +my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? 
and id = ?"; my $insert = -"insert into feed_audit (namespace, id, storage_name, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked) values(?,?,?,?,?,?,?,?,CURRENT_TIMESTAMP) \ +"insert into feed_audit (namespace, id, sdr_partition, zip_size, zip_date, mets_size, mets_date, lastchecked) values(?,?,?,?,?,?,?,CURRENT_TIMESTAMP) \ ON DUPLICATE KEY UPDATE sdr_partition = ?, zip_size=?, zip_date =?,mets_size=?,mets_date=?,lastchecked = CURRENT_TIMESTAMP"; my $update = -"update feed_audit set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ? and storage_name = ?"; +"update feed_audit set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ?"; my $update_mets = -"update feed_audit set page_count = ?, image_size = ? where namespace = ? and id = ? and storage_name = ?"; +"update feed_audit set page_count = ?, image_size = ? where namespace = ? and id = ?"; my $insert_detail = -"insert into feed_audit_detail (namespace, id, storage_name, path, status, detail) values (?,?,?,?,?,?)"; +"insert into feed_audit_detail (namespace, id, path, status, detail) values (?,?,?,?,?)"; my $checkpoint_sel = -"select lastmd5check > ? from feed_audit where namespace = ? and id = ? and storage_name = ?"; +"select lastmd5check > ? from feed_audit where namespace = ? and id = ?"; ### set /sdr1 to /sdrX for test & parallelization my $filesProcessed = 0; @@ -44,23 +44,12 @@ my $do_md5 = 0; my $do_mets = 0; my $checkpoint = undef; -my $storage_name = undef; GetOptions( 'md5!' => \$do_md5, - 'mets!' => \$do_mets, - 'checkpoint=s' => \$checkpoint, - 'storage_name=s' => \$storage_name, +'mets!' 
=> \$do_mets, + 'checkpoint=s' => \$checkpoint, ); - -# $storage_name must be one of 's3-truenas-ictc', 's3-truenas-macc' -if (!defined $storage_name) { - die '--storage_name is required'; -} -if ($storage_name ne 's3-truenas-macc' && $storage_name ne 's3-truenas-ictc') { - die "--storage_name must have value of 's3-truenas-macc' or 's3-truenas-ictc"; -} - my $base = shift @ARGV or die("Missing base directory.."); my ($sdr_partition) = ($base =~ qr#/?sdr(\d+)/?#); @@ -86,38 +75,36 @@ # print "$filesProcessed files processed\n"; # } - my ($pt_objid, $path, $type) = fileparse( $line, qr/\.mets\.xml/, qr/\.zip/ ); - $path =~ s/\/$//; # remove trailing / + # strip trailing / from path + my ( $pt_objid, $path, $type ) = + fileparse( $line, qr/\.mets\.xml/, qr/\.zip/ ); + $path =~ s/\/$//; # remove trailing / return if ( $prevpath and $path eq $prevpath ); - # check mtime on directory - do not check if mtime is in the past two days # to let synciq catch up - # Removed for ETT-1288 - #return if recently_modified_path($path); + return if recently_modified_path($path); $prevpath = $path; - # For testing, remove everything up to and including the `sdrX/` - my $subpath = $path; - $subpath =~ s!.*?sdr\d+/!!; - my @pathcomp = split( "/", $subpath ); + my @pathcomp = split( "/", $path ); # remove base & any empty components @pathcomp = grep { $_ ne '' } @pathcomp; + my $first_path = shift @pathcomp; my $last_path = pop @pathcomp; my $namespace = $pathcomp[1]; my $objid = ppath2id( join( "/", @pathcomp ) ); if ( $pt_objid ne s2ppchars($objid) ) { - set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE", + set_status( $namespace, $objid, $path, "BAD_PAIRTREE", "$objid $pt_objid" ); } if ( $last_path ne $pt_objid ) { - set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE", + set_status( $namespace, $objid, $path, "BAD_PAIRTREE", "$last_path $pt_objid" ); } @@ -150,14 +137,14 @@ $last_touched = $mets_seconds if defined $mets_seconds and (not defined 
$zip_seconds or $mets_seconds > $zip_seconds); #test symlinks unless we're traversing sdr1 or the file is too new - if ( $sdr_partition != 1 and (defined $last_touched and time - $last_touched >= 86400) ) { + if ( $first_path ne 'sdr1' and (defined $last_touched and time - $last_touched >= 86400) ) { my $link_path = join( "/", "/sdr1", @pathcomp, $last_path ); my $link_target = readlink $link_path - or set_status( $namespace, $objid, $storage_name, $path, "CANT_LSTAT", + or set_status( $namespace, $objid, $path, "CANT_LSTAT", "$link_path $!" ); if ( defined $link_target and $link_target ne $path ) { - set_status( $namespace, $objid, $storage_name, $path, "SYMLINK_INVALID", + set_status( $namespace, $objid, $path, "SYMLINK_INVALID", $link_target ); } @@ -167,7 +154,7 @@ execute_stmt( $insert, - $namespace, $objid, $storage_name, + $namespace, $objid, $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate, @@ -196,7 +183,7 @@ $found_zip++ if $ext eq 'zip'; $found_mets++ if $ext eq 'mets.xml'; if ( $pt_objid ne $dir_barcode ) { - set_status( $namespace, $objid, $storage_name, $path, "BARCODE_MISMATCH", + set_status( $namespace, $objid, $path, "BARCODE_MISMATCH", "$pt_objid $dir_barcode" ); } $filecount++; @@ -204,28 +191,27 @@ closedir($dh); - # Removed for ETT-1288 - # check file count; do md5 check and METS extraction stuff, but only if it's fully replicated - #if ( ( defined $zip_seconds and time - $zip_seconds > 86400 ) - # or ( defined $mets_seconds and time - $mets_seconds > 86400 ) ) +# check file count; do md5 check and METS extraction stuff, but only if it's fully replicated + if ( ( defined $zip_seconds and time - $zip_seconds > 86400 ) + or ( defined $mets_seconds and time - $mets_seconds > 86400 ) ) { - if ( $filecount > 2 or $filecount < 1 or ($found_zip != 1 and not is_tombstoned($namespace,$objid,$storage_name) ) or $found_mets != 1 ) { - set_status( $namespace, $objid, $storage_name, $path, "BAD_FILECOUNT", + if ( $filecount > 2 or $filecount < 1 or 
($found_zip != 1 and not is_tombstoned($namespace,$objid) ) or $found_mets != 1 ) { + set_status( $namespace, $objid, $path, "BAD_FILECOUNT", "zip=$found_zip mets=$found_mets total=$filecount" ); } eval { - my $rval = zipcheck( $namespace, $objid, $storage_name ); + my $rval = zipcheck( $namespace, $objid ); if ($rval) { - execute_stmt( $update, "1", $namespace, $objid, $storage_name ); + execute_stmt( $update, "1", $namespace, $objid ); } elsif ( defined $rval ) { - execute_stmt( $update, "0", $namespace, $objid, $storage_name ); + execute_stmt( $update, "0", $namespace, $objid ); } }; if ($@) { - set_status( $namespace, $objid, $storage_name, $path, "CANT_ZIPCHECK", $@ ); + set_status( $namespace, $objid, $path, "CANT_ZIPCHECK", $@ ); } } @@ -237,15 +223,15 @@ } sub zipcheck { - my ( $namespace, $objid, $storage_name ) = @_; + my ( $namespace, $objid ) = @_; return unless $do_md5 or $do_mets; - return if is_tombstoned($namespace, $objid, $storage_name); + return if is_tombstoned($namespace, $objid); # don't check this item if we just looked at it if(defined $checkpoint) { - my $sth = execute_stmt($checkpoint_sel,$checkpoint,$namespace,$objid,$storage_name); + my $sth = execute_stmt($checkpoint_sel,$checkpoint,$namespace,$objid); if(my @row = $sth->fetchrow_array()) { return if @row and $row[0]; } @@ -280,8 +266,7 @@ sub zipcheck { } if ( not defined $mets_zipsum or length($mets_zipsum) ne 32 ) { - set_status( $namespace, $objid, $storage_name, - $volume->get_repository_mets_path(), + set_status( $namespace, $objid, $volume->get_repository_mets_path(), "MISSING_METS_CHECKSUM", undef ); } else { @@ -292,7 +277,7 @@ sub zipcheck { $rval = 1; } else { - set_status( $namespace, $objid, $storage_name, + set_status( $namespace, $objid, $volume->get_repository_zip_path(), "BAD_CHECKSUM", "expected=$mets_zipsum actual=$realsum" ); $rval = 0; @@ -312,7 +297,7 @@ sub zipcheck { $filetypes{$extension}++; } while ( my ( $ext, $count ) = each(%filetypes) ) { - mets_log( 
$namespace, $objid, $storage_name, "FILETYPE", $ext, $count ); + mets_log( $namespace, $objid, "FILETYPE", $ext, $count ); } } @@ -325,7 +310,7 @@ sub zipcheck { $premisversion = "premis2"; } - mets_log( $namespace, $objid, $storage_name, "PREMIS_VERSION", $premisversion ); + mets_log( $namespace, $objid, "PREMIS_VERSION", $premisversion ); } { # PREMIS event ID types @@ -340,7 +325,7 @@ sub zipcheck { $event_id_types{ $mets->findvalue( '.', $eventtype ) }++; } foreach my $event_id_type ( keys(%event_id_types) ) { - mets_log( $namespace, $objid, $storage_name, "PREMIS_EVENT_TYPE", + mets_log( $namespace, $objid, "PREMIS_EVENT_TYPE", $event_id_type, $event_id_types{$event_id_type} ); } } @@ -356,7 +341,7 @@ sub zipcheck { $agent_id_types{ $mets->findvalue( '.', $agenttype ) }++; } foreach my $agent_id_type ( keys(%agent_id_types) ) { - mets_log( $namespace, $objid, $storage_name, "PREMIS_AGENT_TYPE", + mets_log( $namespace, $objid, "PREMIS_AGENT_TYPE", $agent_id_type, $agent_id_types{$agent_id_type} ); } @@ -376,7 +361,7 @@ sub zipcheck { my $date = $mets->findvalue( './premis:eventDateTime', $event ); - mets_log( $namespace, $objid, $storage_name, "CAPTURE", $executor, $date ); + mets_log( $namespace, $objid, "CAPTURE", $executor, $date ); } } { # Processing agent @@ -393,7 +378,7 @@ sub zipcheck { my $date = $mets->findvalue( './premis:eventDateTime', $event ); - mets_log( $namespace, $objid, $storage_name, "MD5SUM", $executor, $date ); + mets_log( $namespace, $objid, "MD5SUM", $executor, $date ); } } @@ -407,14 +392,14 @@ sub zipcheck { my $date = $mets->findvalue( './premis:eventDateTime', $event ); - mets_log( $namespace, $objid, $storage_name, "INGEST", $date ); + mets_log( $namespace, $objid, "INGEST", $date ); } } { # MARC present my $marc_present = $mets->findvalue('count(//marc:record | //record)'); - mets_log( $namespace, $objid, $storage_name, "MARC", $marc_present ); + mets_log( $namespace, $objid, "MARC", $marc_present ); } { # METS valid @@ -425,7 
+410,7 @@ sub zipcheck { $error =~ s/\n/ /mg; } - mets_log( $namespace, $objid, $storage_name, "METS_VALID", $mets_valid, $error ); + mets_log( $namespace, $objid, "METS_VALID", $mets_valid, $error ); } { @@ -442,7 +427,7 @@ sub zipcheck { push( @mdbits, "$attr=$attrval" ); } } - mets_log( $namespace, $objid, $storage_name, "METS_MDSEC", + mets_log( $namespace, $objid, "METS_MDSEC", join( "; ", @mdbits ) ); } } @@ -451,15 +436,15 @@ sub zipcheck { { # Page tagging, image size my $has_pagetags = $mets->findvalue( 'count(//mets:div[@TYPE="page"]/@LABEL[string() != ""])'); - mets_log( $namespace, $objid, $storage_name, "PAGETAGS", $has_pagetags ); + mets_log( $namespace, $objid, "PAGETAGS", $has_pagetags ); my $pages = $mets->findvalue('count(//mets:div[@TYPE="page"])'); - mets_log( $namespace, $objid, $storage_name, "PAGES", $pages ); + mets_log( $namespace, $objid, "PAGES", $pages ); my $image_size = $mets->findvalue('sum(//mets:fileGrp[@USE="image"]/mets:file/@SIZE)'); - mets_log( $namespace, $objid, $storage_name, "IMAGE_SIZE", $image_size); + mets_log( $namespace, $objid, "IMAGE_SIZE", $image_size); - execute_stmt($update_mets,$pages,$image_size,$namespace,$objid,$storage_name); + execute_stmt($update_mets,$pages,$image_size,$namespace,$objid); } @@ -501,16 +486,16 @@ sub extract_source_mets { } } if ( !@srcmets ) { - set_status( $namespace, $objid, $storage_name, $zipfile, "NO_SOURCE_METS", undef ); + set_status( $namespace, $objid, $zipfile, "NO_SOURCE_METS", undef ); } elsif ( @srcmets != 1 ) { - set_status( $namespace, $objid, $storage_name, $zipfile, + set_status( $namespace, $objid, $zipfile, "MULTIPLE_SOURCE_METS_CANDIDATES", undef ); } else { # source METS found - mets_log( $namespace, $objid, $storage_name, "SOURCE_METS", $srcmets[0] ); + mets_log( $namespace, $objid, "SOURCE_METS", $srcmets[0] ); system("cd /tmp; unzip -j '$zipfile' '$srcmets[0]'"); my ($smets_name) = ( $srcmets[0] =~ /\/([^\/]+)$/ ); my $tmp_smets_loc = "/tmp/$smets_name"; @@ -530,13 
+515,13 @@ sub extract_source_mets { $mdsecs{ join( '; ', @mdbits ) } = 1; } foreach my $mdsec ( sort( keys(%mdsecs) ) ) { - mets_log( $namespace, $objid, $storage_name, "SRC_METS_MDSEC", $mdsec ); + mets_log( $namespace, $objid, "SRC_METS_MDSEC", $mdsec ); } # Try to get Google reading order foreach my $tag (qw(gbs:pageOrder gbs:pageSequence gbs:coverTag)) { my $val = $xpc->findvalue("//$tag"); - mets_log( $namespace, $objid, $storage_name, "GBS_READING", $tag, $val ); + mets_log( $namespace, $objid, "GBS_READING", $tag, $val ); } foreach my $techmd ( $xpc->findnodes("//mets:techMD") ) { @@ -547,14 +532,14 @@ sub extract_source_mets { my $count = $xpc->findvalue( "count(//mets:file[contains(\@ADMID,\"$imagemethod_id\")])" ); - mets_log( $namespace, $objid, $storage_name, "IMAGE_METHOD", $method, + mets_log( $namespace, $objid, "IMAGE_METHOD", $method, $count ); } } }; if ($@) { - set_status( $namespace, $objid, $storage_name, $srcmets[0], "BAD_SOURCE_METS", + set_status( $namespace, $objid, $srcmets[0], "BAD_SOURCE_METS", $@ ); } @@ -571,7 +556,7 @@ sub mets_log { my $val2 = shift; $val1 = '' if not defined $val1; $val2 = '' if not defined $val2; - print join( "\t", $namespace, $objid, $storage_name, $key, $val1, $val2 ), "\n"; + print join( "\t", $namespace, $objid, $key, $val1, $val2 ), "\n"; #execute_stmt($fs_mets_data,$namespace,$objid,$key,$val1,$val2); } @@ -579,7 +564,7 @@ sub mets_log { sub is_tombstoned { my $namespace = shift; my $objid = shift; - my $sth = execute_stmt($tombstone_check,$namespace,$objid,$storage_name); + my $sth = execute_stmt($tombstone_check,$namespace,$objid); if(my @row = $sth->fetchrow_array()) { return $row[0]; } else { @@ -587,15 +572,14 @@ sub is_tombstoned { } } -# Removed for ETT-1288 -# sub recently_modified_path { -# my $path = shift; -# -# my $mtime = ( stat($path) )[9]; -# my $mtime_age = time() - $mtime; -# -# return 1 if $mtime_age < (86400 * 2); -# } +sub recently_modified_path { + my $path = shift; + + my $mtime = ( 
stat($path) )[9]; + my $mtime_age = time() - $mtime; + + return 1 if $mtime_age < (86400 * 2); +} sub recent_previous_version { my $file = shift; diff --git a/bin/audit/truenas_audit.pl b/bin/audit/truenas_audit.pl new file mode 100755 index 00000000..4313e570 --- /dev/null +++ b/bin/audit/truenas_audit.pl @@ -0,0 +1,301 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Data::Dumper; +use DBI; +use File::Basename; +use File::Pairtree qw(ppath2id s2ppchars); +use FindBin; +use POSIX qw(strftime); +use Getopt::Long; +use URI::Escape; + +use lib "$FindBin::Bin/../../lib"; +use HTFeed::Config qw(get_config); +use HTFeed::DBTools qw(get_dbh); +use HTFeed::Log {root_logger => 'INFO, screen'}; +use HTFeed::METS; +use HTFeed::Namespace; +use HTFeed::PackageType; +use HTFeed::RepositoryIterator; +use HTFeed::Volume; +use HTFeed::VolumeValidator; + + +# FIXME: is this needed? +my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? and id = ?"; + +my $insert = +"insert into feed_storage (namespace, id, storage_name, zip_size, mets_size, lastchecked) values(?,?,?,?,?,CURRENT_TIMESTAMP) \ +ON DUPLICATE KEY UPDATE zip_size=?, mets_size=?, lastchecked = CURRENT_TIMESTAMP"; + +my $update = +"update feed_storage set md5check_ok = ?, lastmd5check = CURRENT_TIMESTAMP where namespace = ? and id = ? and storage_name = ?"; + +my $insert_detail = +"insert into feed_audit_detail (namespace, id, storage_name, path, status, detail) values (?,?,?,?,?,?)"; + +my $checkpoint_sel = +"select lastmd5check > ? from feed_storage where namespace = ? and id = ?"; + +### set /sdr1 to /sdrX for test & parallelization + +my $do_md5 = 0; +my $checkpoint = undef; +my $noop = undef; +my $storage_name = undef; +GetOptions( + 'md5!' 
=> \$do_md5,
+ 'checkpoint=s' => \$checkpoint,
+ 'noop' => \$noop,
+ 'storage_name=s' => \$storage_name,
+);
+
+# $storage_name must be one of 's3-truenas-ictc', 's3-truenas-macc'
+if (!defined $storage_name) {
+ die '--storage_name is required';
+}
+if ($storage_name ne 's3-truenas-macc' && $storage_name ne 's3-truenas-ictc') {
+ die "--storage_name must have value of 's3-truenas-macc' or 's3-truenas-ictc'";
+}
+
+my $base = shift @ARGV or die("Missing base directory..");
+
+my $iterator = HTFeed::RepositoryIterator->new($base);
+
+
+while (my $obj = $iterator->next_object) {
+ my $sdr_partition = $obj->{sdr_partition};
+ my $path = $obj->{path};
+ my $namespace = $obj->{namespace};
+ my $objid = $obj->{objid};
+ eval {
+ if ($obj->{directory_objid} ne $objid) {
+ set_status( $namespace, $objid, $storage_name, $path, "BAD_PAIRTREE",
+ "$objid $obj->{directory_objid}" );
+ }
+
+ #get last modified date
+ my $zipfile = "$obj->{path}/$obj->{objid}.zip";
+ my $zip_seconds;
+ my $zipdate;
+ my $zipsize;
+
+ if ( -e $zipfile ) {
+ $zip_seconds = ( stat($zipfile) )[9];
+ $zipdate = strftime( "%Y-%m-%d %H:%M:%S", localtime($zip_seconds) );
+ $zipsize = -s $zipfile;
+ }
+
+ my $metsfile = "$obj->{path}/$obj->{objid}.mets.xml";
+
+ my $mets_seconds;
+ my $metsdate;
+ my $metssize;
+
+ if ( -e $metsfile ) {
+ $mets_seconds = ( stat($metsfile) )[9];
+ $metssize = -s $metsfile;
+ $metsdate = strftime( "%Y-%m-%d %H:%M:%S",
+ localtime( ( stat($metsfile) )[9] ) );
+ }
+
+ my $last_touched = $zip_seconds;
+ $last_touched = $mets_seconds if defined $mets_seconds and (not defined $zip_seconds or $mets_seconds > $zip_seconds);
+
+ # FIXME: I don't know if this is needed and if it is this is old code from main_repo_audit.pl so it needs fixin'
+ #test symlinks unless we're traversing sdr1 or the file is too new
+ # if ( $first_path ne 'sdr1' and (defined $last_touched and time - $last_touched >= 86400) ) {
+# my $link_path = join( "/", "/sdr1", @pathcomp, $last_path );
+# my 
$link_target = readlink $link_path +# or set_status( $namespace, $objid, $path, "CANT_LSTAT", +# "$link_path $!" ); +# +# if ( defined $link_target and $link_target ne $path ) { +# set_status( $namespace, $objid, $path, "SYMLINK_INVALID", +# $link_target ); +# } +# +# } + + + #insert + execute_stmt( + $insert, + $namespace, $objid, $storage_name, + $zipsize, $metssize, + # duplicate parameters for duplicate key update + $zipsize, $metssize, + ); + + # does barcode have a zip & xml, and do they match? + opendir( my $dh, $path ); + + my $filecount = 0; + my $found_zip = 0; + my $found_mets = 0; + foreach my $file (@{$obj->{contents}}) { + next if $file =~ /pre_uplift.mets.xml$/; # ignore backup mets + if ( $file !~ /^([^.]+)\.(zip|mets.xml)$/ ) { + set_status($namespace, $objid, $storage_name, $path, "BAD_FILE", "$file"); + next; + } + my $dir_barcode = $1; + my $ext = $2; + $found_zip++ if $ext eq 'zip'; + $found_mets++ if $ext eq 'mets.xml'; + if ($objid ne $dir_barcode) { + set_status($namespace, $objid, $storage_name, $path, "BARCODE_MISMATCH", "$objid $dir_barcode"); + } + $filecount++; + } + + # check file count; do md5 check and METS extraction stuff + if (defined $zip_seconds || defined $mets_seconds) { + if ( $filecount > 2 or $filecount < 1 or ($found_zip != 1 and not is_tombstoned($namespace,$objid) ) or $found_mets != 1 ) { + set_status( $namespace, $objid, $storage_name, $path, "BAD_FILECOUNT", + "zip=$found_zip mets=$found_mets total=$filecount" ); + } + + eval { + my $rval = zipcheck( $namespace, $objid, $storage_name ); + if ($rval) { + execute_stmt( $update, "1", $namespace, $objid, $storage_name ); + } + elsif ( defined $rval ) { + execute_stmt( $update, "0", $namespace, $objid, $storage_name ); + } + }; + if ($@) { + set_status( $namespace, $objid, $storage_name, $path, "CANT_ZIPCHECK", $@ ); + } + } + + }; + + if ($@) { + warn($@); + } +} + +get_dbh()->disconnect(); +$iterator->close; + +sub zipcheck { + my ( $namespace, $objid, $storage_name ) = 
@_; + + return unless $do_md5; + + return if is_tombstoned($namespace, $objid); + + # don't check this item if we just looked at it + if(defined $checkpoint) { + my $sth = execute_stmt($checkpoint_sel,$checkpoint,$namespace,$objid); + if(my @row = $sth->fetchrow_array()) { + return if @row and $row[0]; + } + } + + # use google as a 'default' namespace for now + my $volume = new HTFeed::Volume( + packagetype => "pkgtype", + namespace => $namespace, + objid => $objid + ); + my $mets = $volume->get_repository_mets_xpc(); + my $rval = undef; + +# Extract the checksum for the zip file that looks kind of like this: +# +# +# +# +# + + if ($do_md5) { + my $zipname = $volume->get_zip_filename(); + my $mets_zipsum = $mets->findvalue( + "//mets:file[mets:FLocat/\@xlink:href='$zipname']/\@CHECKSUM"); + + if(not defined $mets_zipsum or length($mets_zipsum) ne 32) { + # zip name may be uri-escaped in some cases + $zipname = uri_escape($zipname); + $mets_zipsum = $mets->findvalue( + "//mets:file[mets:FLocat/\@xlink:href='$zipname']/\@CHECKSUM"); + } + + if ( not defined $mets_zipsum or length($mets_zipsum) ne 32 ) { + set_status( $namespace, $objid, $storage_name, + $volume->get_repository_mets_path(), + "MISSING_METS_CHECKSUM", undef ); + } + else { + my $realsum = HTFeed::VolumeValidator::md5sum( + $volume->get_repository_zip_path() ); + if ( $mets_zipsum eq $realsum ) { + print "$zipname OK\n"; + $rval = 1; + } + else { + set_status( $namespace, $objid, $storage_name, + $volume->get_repository_zip_path(), + "BAD_CHECKSUM", "expected=$mets_zipsum actual=$realsum" ); + $rval = 0; + } + } + } + return $rval; +} + +sub set_status { + warn( join( " ", @_ ), "\n" ); + execute_stmt( $insert_detail, @_ ); +} + +sub execute_stmt { + my $stmt = shift; + + # Bail out if noop and the SQL statement is mutating, SELECT is okay + return if $noop and ($stmt =~ /^insert|update/); + + my $dbh = get_dbh(); + my $sth = $dbh->prepare($stmt); + $sth->execute(@_); + return $sth; +} + +sub 
is_tombstoned { + my $namespace = shift; + my $objid = shift; + my $sth = execute_stmt($tombstone_check,$namespace,$objid); + if(my @row = $sth->fetchrow_array()) { + return $row[0]; + } else { + return 0; + } +} + +sub recently_modified_path { + my $path = shift; + + my $mtime = ( stat($path) )[9]; + my $mtime_age = time() - $mtime; + + return 1 if $mtime_age < (86400 * 2); +} + +sub recent_previous_version { + my $file = shift; + + return unless $file =~ /.old$/; + + my $ctime = ( stat($file) )[10]; + my $ctime_age = time() - $ctime; + + return 1 if $ctime_age < (86400 * 2); + +} + +__END__ diff --git a/lib/HTFeed/RepositoryIterator.pm b/lib/HTFeed/RepositoryIterator.pm index 0328ceeb..e2e18c25 100644 --- a/lib/HTFeed/RepositoryIterator.pm +++ b/lib/HTFeed/RepositoryIterator.pm @@ -50,10 +50,10 @@ sub next_object { next if $line =~ /pairtree_prefix$/; # ignore temporary location next if $line =~ qr(obj/\.tmp); - next if $line =~ /\Qpre_uplift.mets.xml\E/; + #next if $line =~ /\Qpre_uplift.mets.xml\E/; #next if $self->_recent_previous_version($line); - my ($file_objid, $path, $type) = File::Basename::fileparse($line, qr/\.mets\.xml/, qr/\.zip/); + my ($file_objid, $path, $type) = File::Basename::fileparse($line); # Remove trailing slash $path =~ s!/$!!; next if $self->{prev_path} and $path eq $self->{prev_path}; @@ -67,18 +67,15 @@ sub next_object { my @pathcomp = split('/', $subpath); @pathcomp = grep { $_ ne '' } @pathcomp; my $namespace = $pathcomp[1]; - my $directory_objid = $pathcomp[-1]; + my $directory_objid = $pathcomp[-1]; my $objid = File::Pairtree::ppath2id(join('/', @pathcomp)); $obj = { path => $path, namespace => $namespace, - # Caller should make sure all three of these are equivalent + # Caller should make sure objid and directory_objid are equivalent, + # and also that objid matches the contents objid => $objid, - file_objid => $file_objid, directory_objid => $directory_objid, - # This is simple concatenation. 
Might be more interesting to return the actual contents of the directory. - #zipfile => "$path/$file_objid.zip", - #metsfile => "$path/$file_objid.mets.xml", contents => $self->_contents($path), }; last; diff --git a/t/main_repo_audit.t b/t/main_repo_audit.t deleted file mode 100644 index 418e72ed..00000000 --- a/t/main_repo_audit.t +++ /dev/null @@ -1,81 +0,0 @@ -use Test::Spec; -use HTFeed::DBTools qw(get_dbh); -use HTFeed::Storage::LocalPairtree; -use File::Copy; -use File::Pairtree qw(id2ppath s2ppchars); - -use strict; - -describe "bin/audit/main_repo_audit.pl" => sub { - spec_helper 'storage_helper.pl'; - local our ($tmpdirs, $testlog); - - sub local_storage { - my $volume = stage_volume($tmpdirs,@_); - - my $storage = HTFeed::Storage::LocalPairtree->new( - name => 'localpairtree-test', - volume => $volume, - config => { - obj_dir => $tmpdirs->{obj_dir} - } - ); - return $storage; - } - - sub count_feed_audit_entries { - my $namespace = shift; - my $objid = shift; - my $storage_name = shift; - my $sdr_partition = shift; - - my $sql = 'SELECT COUNT(*) FROM feed_audit WHERE namespace=? AND id=? AND storage_name=? AND sdr_partition=?'; - my $sth = get_dbh()->prepare($sql); - $sth->execute($namespace, $objid, $storage_name, $sdr_partition); - if (my @row = $sth->fetchrow_array()) { - return $row[0]; - } else { - return 0; - } - } - - before each => sub { - my $namespace = 'test'; - my $objid = 'test'; - my $storage = local_storage($namespace, $objid); - $storage->stage; - $storage->make_object_path; - $storage->move; - my $pt_objid = s2ppchars($objid); - my $pt_path = id2ppath($objid); - # main_repo_audit.pl can infer its sdr partition when it isn't at the root of the - # filesystem but the `tmpdirs` random names will throw it off completely. Hence we - # copy to a location where we can put "sdr1" in the path. 
- File::Path::make_path('/tmp/sdr1/obj'); - `cp -r $tmpdirs->{obj_dir}/* /tmp/sdr1/obj/`; - # This is just conforming to `etc/config_test.yml` so Volume.pm can find the files. - File::Path::make_path("/tmp/obj_link/test/$pt_path"); - `ln -s /tmp/sdr1/obj/test/$pt_path/$pt_objid /tmp/obj_link/test/$pt_path`; - }; - - after each => sub { - File::Path::remove_tree('/tmp/sdr1'); - File::Path::remove_tree('/tmp/obj_link'); - }; - - describe 'at macc' => sub { - it "succeeds" => sub { - `bin/audit/main_repo_audit.pl --md5 --storage_name s3-truenas-macc /tmp/sdr1`; - is(count_feed_audit_entries('test', 'test', 's3-truenas-macc', 1), 1, 'one feed_audit entry'); - }; - }; - - describe 'at ictc' => sub { - it "succeeds" => sub { - `bin/audit/main_repo_audit.pl --md5 --storage_name s3-truenas-ictc /tmp/sdr1`; - is(count_feed_audit_entries('test', 'test', 's3-truenas-ictc', 1), 1, 'one feed_audit entry'); - }; - }; -}; - -runtests unless caller; diff --git a/t/repository_iterator.t b/t/repository_iterator.t index 60ed247a..1db6fd60 100644 --- a/t/repository_iterator.t +++ b/t/repository_iterator.t @@ -51,7 +51,6 @@ describe "HTFeed::RepositoryIterator" => sub { is($object->{path}, '/tmp/sdr1/obj/ns1/pairtree_root/ob/ji/d1/objid1', 'path to the terminal directory'); is($object->{namespace}, 'ns1', 'namespace `test` from path'); is($object->{objid}, 'objid1', 'objid `objid1` from pairtree'); - is($object->{file_objid}, 'objid1', 'file_objid `objid1` from filename'); is($object->{directory_objid}, 'objid1', 'directory_objid `objid1` from terminal directory name'); is_deeply($object->{contents}, ['objid1.mets.xml','objid1.zip'], '.mets.xml and .zip contents'); is($iterator->{objects_processed}, 1, 'it has processed 1 object'); @@ -72,7 +71,6 @@ describe "HTFeed::RepositoryIterator" => sub { is($object->{path}, '/tmp/sdr1/obj/ns1/pairtree_root/ob/ji/d1/objid1', 'path to the terminal directory'); is($object->{namespace}, 'ns1', 'namespace `ns1` from path'); is($object->{objid}, 
'objid1', 'objid `objid1` from pairtree');
- is($object->{file_objid}, 'objid1', 'file_objid `objid1` from filename');
 is($object->{directory_objid}, 'objid1', 'directory_objid `objid1` from terminal directory name');
 is_deeply($object->{contents}, ['objid1.mets.xml','objid1.zip'], '.mets.xml and .zip contents');
 is($iterator->{objects_processed}, 1, 'it has processed 1 file');
diff --git a/t/truenas_audit.t b/t/truenas_audit.t
new file mode 100644
index 00000000..b01b9d40
--- /dev/null
+++ b/t/truenas_audit.t
@@ -0,0 +1,175 @@
+use Test::Spec;
+use HTFeed::DBTools qw(get_dbh);
+use HTFeed::Storage::LocalPairtree;
+use Data::Dumper;
+use File::Copy;
+use File::Pairtree qw(id2ppath s2ppchars);
+
+use strict;
+use warnings;
+
+describe "bin/audit/truenas_audit.pl" => sub {
+ spec_helper 'storage_helper.pl';
+ local our ($tmpdirs, $testlog);
+
+ sub local_storage {
+ my $volume = stage_volume($tmpdirs,@_);
+
+ my $storage = HTFeed::Storage::LocalPairtree->new(
+ name => 'localpairtree-test',
+ volume => $volume,
+ config => {
+ obj_dir => $tmpdirs->{obj_dir}
+ }
+ );
+ return $storage;
+ }
+
+ # Returns the data as arrayref of hashref
+ sub get_feed_storage_data {
+ my $namespace = shift;
+ my $objid = shift;
+ my $storage_name = shift;
+
+ my $data = [];
+ my $sql = 'SELECT * FROM feed_storage WHERE namespace=? AND id=? AND storage_name=?';
+ my $sth = get_dbh()->prepare($sql);
+ $sth->execute($namespace, $objid, $storage_name);
+ push(@$data, $sth->fetchrow_hashref);
+ return $data;
+ }
+
+ # Returns the data as arrayref of hashref
+ sub get_feed_audit_detail_data {
+ my $namespace = shift;
+ my $objid = shift;
+ my $storage_name = shift;
+
+ my $data = [];
+ my $sql = 'SELECT * FROM feed_audit_detail WHERE namespace=? AND id=? 
AND storage_name=?'; + my $sth = get_dbh()->prepare($sql); + $sth->execute($namespace, $objid, $storage_name); + push(@$data, $sth->fetchrow_hashref); + return $data; + } + + before each => sub { + my $namespace = 'test'; + my $objid = 'test'; + my $storage = local_storage($namespace, $objid); + $storage->stage; + $storage->make_object_path; + $storage->move; + my $pt_objid = s2ppchars($objid); + my $pt_path = id2ppath($objid); + # truenas_audit.pl can infer its sdr partition when it isn't at the root of the + # filesystem but the `tmpdirs` random names will throw it off completely. Hence we + # copy to a location where we can put "sdr1" in the path. + File::Path::make_path('/tmp/sdr1/obj'); + `cp -r $tmpdirs->{obj_dir}/* /tmp/sdr1/obj/`; + # This is just conforming to `etc/config_test.yml` so Volume.pm can find the files. + File::Path::make_path("/tmp/obj_link/test/$pt_path"); + `ln -s /tmp/sdr1/obj/test/$pt_path/$pt_objid /tmp/obj_link/test/$pt_path`; + }; + + after each => sub { + File::Path::remove_tree('/tmp/sdr1'); + File::Path::remove_tree('/tmp/obj_link'); + get_dbh->prepare('DELETE FROM feed_storage')->execute; + get_dbh->prepare('DELETE FROM feed_audit_detail')->execute; + }; + + foreach my $storage_name (('s3-truenas-macc', 's3-truenas-ictc')) { + it "writes to feed_storage" => sub { + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + my $db_data = get_feed_storage_data('test', 'test', $storage_name); + is(scalar(@$db_data), 1, 'with only one initial entry'); + is($db_data->[0]->{namespace}, 'test', 'correct namespace'); + is($db_data->[0]->{id}, 'test', 'correct id'); + is($db_data->[0]->{storage_name}, $storage_name, 'correct storage_name'); + ok($db_data->[0]->{zip_size} > 0, 'nonzero zip_size'); + ok($db_data->[0]->{mets_size} > 0, 'nonzero mets_size'); + ok(!defined $db_data->[0]->{saved_md5sum}, 'not defined saved_md5sum'); + ok(defined $db_data->[0]->{deposit_time}, 'defined deposit_time'); + ok(defined 
$db_data->[0]->{lastchecked}, 'defined lastchecked'); + ok(defined $db_data->[0]->{lastmd5check}, 'defined lastmd5check'); + is($db_data->[0]->{md5check_ok}, 1, 'md5check_ok=1'); + }; + } + + # If existing data, only `lastchecked` and `lastmd5check` will change + # (file sizes will also be updated but with the same data). + it "updates existing data" => sub { + my $storage_name = 's3-truenas-macc'; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + my $db_data = get_feed_storage_data('test', 'test', $storage_name); + is(scalar(@$db_data), 1, 'with only one initial entry'); + my $old_lastchecked = $db_data->[0]->{lastchecked}; + my $old_lastmd5check = $db_data->[0]->{lastmd5check}; + sleep 1; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + $db_data = get_feed_storage_data('test', 'test', $storage_name); + my $new_lastchecked = $db_data->[0]->{lastchecked}; + my $new_lastmd5check = $db_data->[0]->{lastmd5check}; + is(scalar(@$db_data), 1, 'with only one final entry'); + isnt($old_lastchecked, $new_lastchecked, 'with changed `lastchecked`'); + isnt($old_lastmd5check, $new_lastmd5check, 'with changed `lastmd5check`'); + }; + + it "records a failed MD5 check" => sub { + my $storage_name = 's3-truenas-macc'; + my $objid = 'test'; + # Fiddle with the zip + my $pt_objid = s2ppchars($objid); + my $pt_path = id2ppath($objid); + my $zip_path = "/tmp/sdr1/obj/test/$pt_path$pt_objid/" . 
"$objid.zip"; + open(my $fh, '>', $zip_path) or die "open zip file $zip_path failed: $!"; + print $fh "shwoozle\n"; + close($fh); + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + my $db_data = get_feed_storage_data('test', 'test', $storage_name); + is(scalar(@$db_data), 1, 'with only one initial feed_storage entry'); + ok(defined $db_data->[0]->{lastchecked}, 'defined lastchecked'); + ok(defined $db_data->[0]->{lastmd5check}, 'defined lastmd5check'); + is($db_data->[0]->{md5check_ok}, 0, 'md5check_ok=0'); + my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name); + is(scalar(@$detail_data), 1, 'with one feed_audit_detail entry'); + is($detail_data->[0]->{namespace}, 'test', 'feed_audit_detail namespace'); + is($detail_data->[0]->{id}, 'test', 'feed_audit_detail id'); + is($detail_data->[0]->{storage_name}, $storage_name, 'feed_audit_detail storage_name'); + # The path for these examples is via the symlink, so it will be different from the $zip_path we fiddled with + ok($detail_data->[0]->{path} =~ /\.zip$/, 'feed_audit_detail path'); + is($detail_data->[0]->{status}, 'BAD_CHECKSUM', 'feed_audit_detail status'); + ok($detail_data->[0]->{detail} =~ /expected=/, 'feed_audit_detail detail'); + ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined'); + }; + + it "records a spurious file but ignores pre-uplift METS" => sub { + my $storage_name = 's3-truenas-macc'; + my $objid = 'test'; + # Add a silly file and a pre-uplift file + my $pt_objid = s2ppchars($objid); + my $pt_path = id2ppath($objid); + foreach my $ext (('silly', 'pre_uplift.mets.xml')) { + my $path = "/tmp/sdr1/obj/test/$pt_path$pt_objid/" . 
"$objid.$ext"; + open(my $fh, '>', $path) or die "open file $path failed: $!"; + print $fh "shwoozle\n"; + close($fh); + } + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + my $db_data = get_feed_storage_data('test', 'test', $storage_name); + is(scalar(@$db_data), 1, 'with only one feed_storage entry'); + is($db_data->[0]->{md5check_ok}, 1, 'md5check_ok=1'); + my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name); + is(scalar(@$detail_data), 1, 'with one feed_audit_detail entry'); + is($detail_data->[0]->{namespace}, 'test', 'feed_audit_detail namespace'); + is($detail_data->[0]->{id}, 'test', 'feed_audit_detail id'); + is($detail_data->[0]->{storage_name}, $storage_name, 'feed_audit_detail storage_name'); + ok(defined $detail_data->[0]->{path}, 'feed_audit_detail path defined'); + is($detail_data->[0]->{status}, 'BAD_FILE', 'feed_audit_detail status'); + ok($detail_data->[0]->{detail} =~ /silly/, 'feed_audit_detail detail'); + ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined'); + }; +}; + +runtests unless caller; From 93342430f6e93b2d4a2212cf64854754bc9ab971 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Wed, 11 Mar 2026 12:52:38 -0400 Subject: [PATCH 08/15] Remove no longer needed `opendir` in truenas_audit.pl --- bin/audit/truenas_audit.pl | 3 --- 1 file changed, 3 deletions(-) diff --git a/bin/audit/truenas_audit.pl b/bin/audit/truenas_audit.pl index 4313e570..60f6c4c9 100755 --- a/bin/audit/truenas_audit.pl +++ b/bin/audit/truenas_audit.pl @@ -62,10 +62,8 @@ } my $base = shift @ARGV or die("Missing base directory.."); - my $iterator = HTFeed::RepositoryIterator->new($base); - while (my $obj = $iterator->next_object) { my $sdr_partition = $obj->{sdr_partition}; my $path = $obj->{path}; @@ -131,7 +129,6 @@ ); # does barcode have a zip & xml, and do they match? 
- opendir( my $dh, $path ); my $filecount = 0; my $found_zip = 0; From 6f5c0f666975103431f95a306eb7779020893168 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Wed, 11 Mar 2026 13:03:24 -0400 Subject: [PATCH 09/15] Spurious file test just needs `touch` without worrying about writing file contents. --- t/truenas_audit.t | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/t/truenas_audit.t b/t/truenas_audit.t index b01b9d40..f58553b0 100644 --- a/t/truenas_audit.t +++ b/t/truenas_audit.t @@ -147,14 +147,12 @@ describe "bin/audit/main_repo_audit.pl" => sub { it "records a spurious file but ignores pre-uplift METS" => sub { my $storage_name = 's3-truenas-macc'; my $objid = 'test'; - # Add a silly file and a pre-uplift file + # Add a silly file and a pre-uplift file (can be empty, contents don't matter) my $pt_objid = s2ppchars($objid); my $pt_path = id2ppath($objid); foreach my $ext (('silly', 'pre_uplift.mets.xml')) { my $path = "/tmp/sdr1/obj/test/$pt_path$pt_objid/" . "$objid.$ext"; - open(my $fh, '>', $path) or die "open file $path failed: $!"; - print $fh "shwoozle\n"; - close($fh); + `touch $path`; } `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; my $db_data = get_feed_storage_data('test', 'test', $storage_name); From 33f290aa9e7f87912f5b9a2a756dc4bf203eeeef Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Thu, 12 Mar 2026 10:33:37 -0400 Subject: [PATCH 10/15] Add comment and remove FIXME on question about checking `feed_audit.is_tombstoned` --- bin/audit/truenas_audit.pl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/audit/truenas_audit.pl b/bin/audit/truenas_audit.pl index 60f6c4c9..7cec3e80 100755 --- a/bin/audit/truenas_audit.pl +++ b/bin/audit/truenas_audit.pl @@ -23,8 +23,6 @@ use HTFeed::Volume; use HTFeed::VolumeValidator; - -# FIXME: is this needed? my $tombstone_check = "select is_tombstoned from feed_audit where namespace = ? 
and id = ?"; my $insert = @@ -263,6 +261,7 @@ sub execute_stmt { return $sth; } +# There are as of early 2026 still 13 is_tombstoned entries in feed_audit, so this check stays. sub is_tombstoned { my $namespace = shift; my $objid = shift; From 06a29c74dd33ca9cfc4df17f41829d7af1a2ee54 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Thu, 12 Mar 2026 13:12:30 -0400 Subject: [PATCH 11/15] Put testing sdrX directory under `$tmpdirs->{tmpdir}` instead of `/tmp/sdr1` --- t/truenas_audit.t | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/t/truenas_audit.t b/t/truenas_audit.t index f58553b0..d906bdf3 100644 --- a/t/truenas_audit.t +++ b/t/truenas_audit.t @@ -53,6 +53,14 @@ describe "bin/audit/main_repo_audit.pl" => sub { return $data; } + # `RepositoryIterator` can infer its sdr partition when it isn't at the root of the + # filesystem. Hence we copy to a location where we can put "sdr1" in the path. + sub temp_sdr_path { + my $sdr_partition = shift || 1; + + return "$tmpdirs->{tmpdir}/sdr$sdr_partition"; + } + before each => sub { my $namespace = 'test'; my $objid = 'test'; @@ -62,18 +70,16 @@ describe "bin/audit/main_repo_audit.pl" => sub { $storage->move; my $pt_objid = s2ppchars($objid); my $pt_path = id2ppath($objid); - # truenas_audit.pl can infer its sdr partition when it isn't at the root of the - # filesystem but the `tmpdirs` random names will throw it off completely. Hence we - # copy to a location where we can put "sdr1" in the path. - File::Path::make_path('/tmp/sdr1/obj'); - `cp -r $tmpdirs->{obj_dir}/* /tmp/sdr1/obj/`; + my $temp_sdr_path = temp_sdr_path; + File::Path::make_path("$temp_sdr_path/obj"); + `cp -r $tmpdirs->{obj_dir}/* $temp_sdr_path/obj/`; # This is just conforming to `etc/config_test.yml` so Volume.pm can find the files. 
File::Path::make_path("/tmp/obj_link/test/$pt_path"); - `ln -s /tmp/sdr1/obj/test/$pt_path/$pt_objid /tmp/obj_link/test/$pt_path`; + `ln -s $temp_sdr_path/obj/test/$pt_path/$pt_objid /tmp/obj_link/test/$pt_path`; }; after each => sub { - File::Path::remove_tree('/tmp/sdr1'); + File::Path::remove_tree(temp_sdr_path); File::Path::remove_tree('/tmp/obj_link'); get_dbh->prepare('DELETE FROM feed_storage')->execute; get_dbh->prepare('DELETE FROM feed_audit_detail')->execute; @@ -81,7 +87,8 @@ describe "bin/audit/main_repo_audit.pl" => sub { foreach my $storage_name (('s3-truenas-macc', 's3-truenas-ictc')) { it "writes to feed_storage" => sub { - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + my $temp_sdr_path = temp_sdr_path; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one initial entry'); is($db_data->[0]->{namespace}, 'test', 'correct namespace'); @@ -100,14 +107,15 @@ describe "bin/audit/main_repo_audit.pl" => sub { # If existing data, only `lastchecked` and `lastmd5check` will change # (file sizes will also be updated but with the same data). 
it "updates existing data" => sub { + my $temp_sdr_path = temp_sdr_path; my $storage_name = 's3-truenas-macc'; - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one initial entry'); my $old_lastchecked = $db_data->[0]->{lastchecked}; my $old_lastmd5check = $db_data->[0]->{lastmd5check}; sleep 1; - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; $db_data = get_feed_storage_data('test', 'test', $storage_name); my $new_lastchecked = $db_data->[0]->{lastchecked}; my $new_lastmd5check = $db_data->[0]->{lastmd5check}; @@ -117,16 +125,17 @@ describe "bin/audit/main_repo_audit.pl" => sub { }; it "records a failed MD5 check" => sub { + my $temp_sdr_path = temp_sdr_path; my $storage_name = 's3-truenas-macc'; my $objid = 'test'; # Fiddle with the zip my $pt_objid = s2ppchars($objid); my $pt_path = id2ppath($objid); - my $zip_path = "/tmp/sdr1/obj/test/$pt_path$pt_objid/" . "$objid.zip"; + my $zip_path = "$temp_sdr_path/obj/test/$pt_path$pt_objid/" . 
"$objid.zip"; open(my $fh, '>', $zip_path) or die "open zip file $zip_path failed: $!"; print $fh "shwoozle\n"; close($fh); - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one initial feed_storage entry'); ok(defined $db_data->[0]->{lastchecked}, 'defined lastchecked'); @@ -143,18 +152,19 @@ describe "bin/audit/main_repo_audit.pl" => sub { ok($detail_data->[0]->{detail} =~ /expected=/, 'feed_audit_detail detail'); ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined'); }; - + it "records a spurious file but ignores pre-uplift METS" => sub { + my $temp_sdr_path = temp_sdr_path; my $storage_name = 's3-truenas-macc'; my $objid = 'test'; # Add a silly file and a pre-uplift file (can be empty, contents don't matter) my $pt_objid = s2ppchars($objid); my $pt_path = id2ppath($objid); foreach my $ext (('silly', 'pre_uplift.mets.xml')) { - my $path = "/tmp/sdr1/obj/test/$pt_path$pt_objid/" . "$objid.$ext"; + my $path = "$temp_sdr_path/obj/test/$pt_path$pt_objid/" . 
"$objid.$ext"; `touch $path`; } - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name /tmp/sdr1`; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one feed_storage entry'); is($db_data->[0]->{md5check_ok}, 1, 'md5check_ok=1'); From b279f2927effc7be8242ee038ad53206c4905f22 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Fri, 13 Mar 2026 13:32:12 -0400 Subject: [PATCH 12/15] - Enable symlink checks outside sdr1, and add tests for same - Refactor some of the path logic in `truenas_audit.t` --- bin/audit/truenas_audit.pl | 35 +++++----- t/truenas_audit.t | 134 ++++++++++++++++++++++++++++++------- 2 files changed, 130 insertions(+), 39 deletions(-) diff --git a/bin/audit/truenas_audit.pl b/bin/audit/truenas_audit.pl index 7cec3e80..8a80431b 100755 --- a/bin/audit/truenas_audit.pl +++ b/bin/audit/truenas_audit.pl @@ -61,9 +61,9 @@ my $base = shift @ARGV or die("Missing base directory.."); my $iterator = HTFeed::RepositoryIterator->new($base); +my $sdr_partition = $iterator->{sdr_partition}; while (my $obj = $iterator->next_object) { - my $sdr_partition = $obj->{sdr_partition}; my $path = $obj->{path}; my $namespace = $obj->{namespace}; my $objid = $obj->{objid}; @@ -101,20 +101,25 @@ my $last_touched = $zip_seconds; $last_touched = $mets_seconds if defined $mets_seconds and (not defined $zip_seconds or $mets_seconds > $zip_seconds); - # FIXME: I don't know if this is needed and if it is this is old code from main_repo_audit.pl so it needs fixin' - #test symlinks unless we're traversing sdr1 or the file is too new - # if ( $first_path ne 'sdr1' and (defined $last_touched and time - $last_touched >= 86400) ) { -# my $link_path = join( "/", "/sdr1", @pathcomp, $last_path ); -# my $link_target = readlink $link_path -# or set_status( $namespace, $objid, $path, "CANT_LSTAT", -# "$link_path $!" 
); -# -# if ( defined $link_target and $link_target ne $path ) { -# set_status( $namespace, $objid, $path, "SYMLINK_INVALID", -# $link_target ); -# } -# -# } + # test symlinks unless we're traversing sdr1 + if ( $sdr_partition ne '1' ) { + my $volume = new HTFeed::Volume( + packagetype => "pkgtype", + namespace => $namespace, + objid => $objid + ); + my $link_path = $path; + $link_path =~ s/sdr$sdr_partition/sdr1/; + my $link_target = readlink $link_path + or set_status( $namespace, $objid, $storage_name, $path, "CANT_LSTAT", + "$link_path $!" ); + + if ( defined $link_target and $link_target ne $path ) { + set_status( $namespace, $objid, $storage_name, $path, "SYMLINK_INVALID", + $link_target || '' ); + } + + } #insert diff --git a/t/truenas_audit.t b/t/truenas_audit.t index d906bdf3..ce3978d6 100644 --- a/t/truenas_audit.t +++ b/t/truenas_audit.t @@ -1,12 +1,14 @@ -use Test::Spec; -use HTFeed::DBTools qw(get_dbh); -use HTFeed::Storage::LocalPairtree; +use strict; +use warnings; + use Data::Dumper; use File::Copy; use File::Pairtree qw(id2ppath s2ppchars); +use File::Spec; +use Test::Spec; -use strict; -use warnings; +use HTFeed::DBTools qw(get_dbh); +use HTFeed::Storage::LocalPairtree; describe "bin/audit/main_repo_audit.pl" => sub { spec_helper 'storage_helper.pl'; @@ -35,7 +37,9 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $sql = 'SELECT * FROM feed_storage WHERE namespace=? AND id=? AND storage_name=?'; my $sth = get_dbh()->prepare($sql); $sth->execute($namespace, $objid, $storage_name); - push(@$data, $sth->fetchrow_hashref); + while (my $row = $sth->fetchrow_hashref) { + push(@$data, $row); + } return $data; } @@ -49,16 +53,72 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $sql = 'SELECT * FROM feed_audit_detail WHERE namespace=? AND id=? 
AND storage_name=?'; my $sth = get_dbh()->prepare($sql); $sth->execute($namespace, $objid, $storage_name); - push(@$data, $sth->fetchrow_hashref); + while (my $row = $sth->fetchrow_hashref) { + push(@$data, $row); + } return $data; } # `RepositoryIterator` can infer its sdr partition when it isn't at the root of the - # filesystem. Hence we copy to a location where we can put "sdr1" in the path. + # filesystem but it does need an "sdrX" directory _somewhere_ in the path. We can't use + # `$tmpdirs->{obj_dir}` by itself. sub temp_sdr_path { my $sdr_partition = shift || 1; - return "$tmpdirs->{tmpdir}/sdr$sdr_partition"; + return File::Spec->catfile($tmpdirs->{tmpdir}, "sdr$sdr_partition"); + } + + sub temp_sdr_obj_path { + my $sdr_partition = shift || 1; + my $namespace = shift || 'test'; + my $objid = shift || 'test'; + + return File::Spec->catfile( + temp_sdr_path($sdr_partition), + 'obj', + $namespace, + id2ppath($objid), + s2ppchars($objid) + ); + } + + sub temp_link_path { + my $namespace = shift || 'test'; + my $objid = shift || 'test'; + + return File::Spec->catfile( + File::Spec->rootdir, + 'tmp', + 'obj_link', + $namespace, + id2ppath($objid), + s2ppchars($objid) + ); + } + + # Set up sdr1 and sdr2 directories with the appropriate linkage from latter to former. + # Copy contents from `$tempdirs->{obj_dir}` into a local sdr2 so `RepositoryIterator` has + # the proprioceptive stimulus (i.e., a directory named "sdr2" somewhere in the path) it needs. 
+ sub make_test_directories { + my $namespace = shift; + my $objid = shift; + my $sdr2_path = temp_sdr_path(2); + my $sdr1_obj_path = temp_sdr_obj_path(1); + my $sdr2_obj_path = temp_sdr_obj_path(2); + my $temp_link_path = temp_link_path; + + File::Path::make_path("$sdr2_obj_path"); + `cp -r $tmpdirs->{obj_dir}/* $sdr2_path/obj/`; + # Symlink into obj_link so Volume.pm can find the files, + # and into sdr1 for symlink checks inside truenas_audit.pl + # Create directory structures but remove the leaf node so we can recreate it as a symlink. + # This is kind of silly but trying to create a partial path would be messier. + File::Path::make_path($temp_link_path); + File::Path::remove_tree($temp_link_path); + File::Path::make_path($sdr1_obj_path); + File::Path::remove_tree($sdr1_obj_path); + `ln -sf $sdr2_obj_path $temp_link_path`; + `ln -sf $sdr2_obj_path $sdr1_obj_path`; } before each => sub { @@ -68,18 +128,12 @@ describe "bin/audit/main_repo_audit.pl" => sub { $storage->stage; $storage->make_object_path; $storage->move; - my $pt_objid = s2ppchars($objid); - my $pt_path = id2ppath($objid); - my $temp_sdr_path = temp_sdr_path; - File::Path::make_path("$temp_sdr_path/obj"); - `cp -r $tmpdirs->{obj_dir}/* $temp_sdr_path/obj/`; - # This is just conforming to `etc/config_test.yml` so Volume.pm can find the files. 
- File::Path::make_path("/tmp/obj_link/test/$pt_path"); - `ln -s $temp_sdr_path/obj/test/$pt_path/$pt_objid /tmp/obj_link/test/$pt_path`; + make_test_directories($namespace, $objid); }; after each => sub { File::Path::remove_tree(temp_sdr_path); + File::Path::remove_tree(temp_sdr_path(2)); File::Path::remove_tree('/tmp/obj_link'); get_dbh->prepare('DELETE FROM feed_storage')->execute; get_dbh->prepare('DELETE FROM feed_audit_detail')->execute; @@ -128,10 +182,8 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $temp_sdr_path = temp_sdr_path; my $storage_name = 's3-truenas-macc'; my $objid = 'test'; - # Fiddle with the zip - my $pt_objid = s2ppchars($objid); - my $pt_path = id2ppath($objid); - my $zip_path = "$temp_sdr_path/obj/test/$pt_path$pt_objid/" . "$objid.zip"; + # Replace the zip with garbage + my $zip_path = File::Spec->catfile(temp_sdr_obj_path, "$objid.zip"); open(my $fh, '>', $zip_path) or die "open zip file $zip_path failed: $!"; print $fh "shwoozle\n"; close($fh); @@ -158,10 +210,8 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $storage_name = 's3-truenas-macc'; my $objid = 'test'; # Add a silly file and a pre-uplift file (can be empty, contents don't matter) - my $pt_objid = s2ppchars($objid); - my $pt_path = id2ppath($objid); foreach my $ext (('silly', 'pre_uplift.mets.xml')) { - my $path = "$temp_sdr_path/obj/test/$pt_path$pt_objid/" . "$objid.$ext"; + my $path = File::Spec->catfile(temp_sdr_obj_path, "$objid.$ext"); `touch $path`; } `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; @@ -178,6 +228,42 @@ describe "bin/audit/main_repo_audit.pl" => sub { ok($detail_data->[0]->{detail} =~ /silly/, 'feed_audit_detail detail'); ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined'); }; + + # For symlink checks we use sdr2 so the symlinks in sdr1 can be verified to point to + # the right place in sdr2. 
+ it "checks symlinks" => sub { + my $temp_sdr_path = temp_sdr_path(2); + my $storage_name = 's3-truenas-macc'; + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + my $db_data = get_feed_storage_data('test', 'test', $storage_name); + is(scalar(@$db_data), 1, 'with feed_storage entry'); + my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name); + is(scalar(@$detail_data), 0, 'with no feed_audit_detail entries'); + }; + + it "detects bad symlinks" => sub { + my $temp_sdr_path = temp_sdr_path(2); + my $storage_name = 's3-truenas-macc'; + + # Remove the symlink on sdr1 and replace it with a link to somewhere else + my $sdr1_link_location = temp_sdr_obj_path; + # "Somewhere else" is /dev/null + # Create a symlink clobbering the existing one without following it + `ln -sfn /dev/null $sdr1_link_location`; + + `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + my $db_data = get_feed_storage_data('test', 'test', $storage_name); + is(scalar(@$db_data), 1, 'with feed_storage entry'); + my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name); + is(scalar(@$detail_data), 1, 'with one feed_audit_detail entry'); + is($detail_data->[0]->{namespace}, 'test', 'feed_audit_detail namespace'); + is($detail_data->[0]->{id}, 'test', 'feed_audit_detail id'); + is($detail_data->[0]->{storage_name}, $storage_name, 'feed_audit_detail storage_name'); + ok($detail_data->[0]->{path} =~ /sdr2/, 'feed_audit_detail path implicates sdr2'); + is($detail_data->[0]->{status}, 'SYMLINK_INVALID', 'feed_audit_detail status'); + ok($detail_data->[0]->{detail} =~ /null/, 'feed_audit_detail detail'); + ok(defined $detail_data->[0]->{time}, 'feed_audit_detail time defined'); + }; }; runtests unless caller; From fad9af74c1622ae2088b30683984c295aa1a131c Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Fri, 13 Mar 2026 13:38:05 -0400 Subject: [PATCH 13/15] Remove two unused subroutines from 
main_repo_audit but unneeded here --- bin/audit/truenas_audit.pl | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/bin/audit/truenas_audit.pl b/bin/audit/truenas_audit.pl index 8a80431b..8eff7f91 100755 --- a/bin/audit/truenas_audit.pl +++ b/bin/audit/truenas_audit.pl @@ -278,25 +278,4 @@ sub is_tombstoned { } } -sub recently_modified_path { - my $path = shift; - - my $mtime = ( stat($path) )[9]; - my $mtime_age = time() - $mtime; - - return 1 if $mtime_age < (86400 * 2); -} - -sub recent_previous_version { - my $file = shift; - - return unless $file =~ /.old$/; - - my $ctime = ( stat($file) )[10]; - my $ctime_age = time() - $ctime; - - return 1 if $ctime_age < (86400 * 2); - -} - __END__ From 431c359b23bd44d1295805effee6f8d8ce7f5fbb Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Fri, 13 Mar 2026 13:45:13 -0400 Subject: [PATCH 14/15] Replace backticks with system() --- t/truenas_audit.t | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/t/truenas_audit.t b/t/truenas_audit.t index ce3978d6..ad13b64c 100644 --- a/t/truenas_audit.t +++ b/t/truenas_audit.t @@ -108,7 +108,7 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $temp_link_path = temp_link_path; File::Path::make_path("$sdr2_obj_path"); - `cp -r $tmpdirs->{obj_dir}/* $sdr2_path/obj/`; + system("cp -r $tmpdirs->{obj_dir}/* $sdr2_path/obj/"); # Symlink into obj_link so Volume.pm can find the files, # and into sdr1 for symlink checks inside truenas_audit.pl # Create directory structures but remove the leaf node so we can recreate it as a symlink. 
@@ -117,8 +117,8 @@ describe "bin/audit/main_repo_audit.pl" => sub { File::Path::remove_tree($temp_link_path); File::Path::make_path($sdr1_obj_path); File::Path::remove_tree($sdr1_obj_path); - `ln -sf $sdr2_obj_path $temp_link_path`; - `ln -sf $sdr2_obj_path $sdr1_obj_path`; + system("ln -sf $sdr2_obj_path $temp_link_path"); + system("ln -sf $sdr2_obj_path $sdr1_obj_path"); } before each => sub { @@ -142,7 +142,7 @@ describe "bin/audit/main_repo_audit.pl" => sub { foreach my $storage_name (('s3-truenas-macc', 's3-truenas-ictc')) { it "writes to feed_storage" => sub { my $temp_sdr_path = temp_sdr_path; - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one initial entry'); is($db_data->[0]->{namespace}, 'test', 'correct namespace'); @@ -163,13 +163,13 @@ describe "bin/audit/main_repo_audit.pl" => sub { it "updates existing data" => sub { my $temp_sdr_path = temp_sdr_path; my $storage_name = 's3-truenas-macc'; - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one initial entry'); my $old_lastchecked = $db_data->[0]->{lastchecked}; my $old_lastmd5check = $db_data->[0]->{lastmd5check}; sleep 1; - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); $db_data = get_feed_storage_data('test', 'test', $storage_name); my $new_lastchecked = $db_data->[0]->{lastchecked}; my $new_lastmd5check = $db_data->[0]->{lastmd5check}; @@ -187,7 +187,7 @@ describe "bin/audit/main_repo_audit.pl" => sub { open(my $fh, '>', $zip_path) 
or die "open zip file $zip_path failed: $!"; print $fh "shwoozle\n"; close($fh); - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one initial feed_storage entry'); ok(defined $db_data->[0]->{lastchecked}, 'defined lastchecked'); @@ -212,9 +212,9 @@ describe "bin/audit/main_repo_audit.pl" => sub { # Add a silly file and a pre-uplift file (can be empty, contents don't matter) foreach my $ext (('silly', 'pre_uplift.mets.xml')) { my $path = File::Spec->catfile(temp_sdr_obj_path, "$objid.$ext"); - `touch $path`; + system("touch $path"); } - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with only one feed_storage entry'); is($db_data->[0]->{md5check_ok}, 1, 'md5check_ok=1'); @@ -234,7 +234,7 @@ describe "bin/audit/main_repo_audit.pl" => sub { it "checks symlinks" => sub { my $temp_sdr_path = temp_sdr_path(2); my $storage_name = 's3-truenas-macc'; - `bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with feed_storage entry'); my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name); @@ -249,9 +249,9 @@ describe "bin/audit/main_repo_audit.pl" => sub { my $sdr1_link_location = temp_sdr_obj_path; # "Somewhere else" is /dev/null # Create a symlink clobbering the existing one without following it - `ln -sfn /dev/null $sdr1_link_location`; + system("ln -sfn /dev/null $sdr1_link_location"); - 
`bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path`; + system("bin/audit/truenas_audit.pl --md5 --storage_name $storage_name $temp_sdr_path"); my $db_data = get_feed_storage_data('test', 'test', $storage_name); is(scalar(@$db_data), 1, 'with feed_storage entry'); my $detail_data = get_feed_audit_detail_data('test', 'test', $storage_name); From 43bf649d69b81fe70eee2b80724a241a2126f742 Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Fri, 13 Mar 2026 14:21:34 -0400 Subject: [PATCH 15/15] Update record_audit logic to bail out if storage name is not set --- lib/HTFeed/Storage/LocalPairtree.pm | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/HTFeed/Storage/LocalPairtree.pm b/lib/HTFeed/Storage/LocalPairtree.pm index 3e55646b..4be518a2 100644 --- a/lib/HTFeed/Storage/LocalPairtree.pm +++ b/lib/HTFeed/Storage/LocalPairtree.pm @@ -144,9 +144,7 @@ sub record_audit { # TODO populate image_size, page_count - # FIXME: is this right?? should we force it to one of {s3-truenas-ictc, s3-truenas-macc}? - my $storage_name = $self->{storage_name} || 'LocalPairtree'; - + return unless defined $self->{name}; my $zipsize = $self->zip_size; my $zipdate = $self->file_date($self->zip_obj_path); @@ -154,8 +152,7 @@ sub record_audit { my $metsdate = $self->file_date($self->mets_obj_path); my $sth = get_dbh()->prepare($stmt); my $res = $sth->execute( - $self->{namespace}, $self->{objid}, - $storage_name, + $self->{namespace}, $self->{objid}, $self->{name}, $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate, # duplicate parameters for duplicate key update $sdr_partition, $zipsize, $zipdate, $metssize, $metsdate