diff --git a/tensorboard/backend/event_processing/io_wrapper.py b/tensorboard/backend/event_processing/io_wrapper.py
index 1e23eb14005..ab520ace98d 100644
--- a/tensorboard/backend/event_processing/io_wrapper.py
+++ b/tensorboard/backend/event_processing/io_wrapper.py
@@ -176,6 +176,58 @@ def ListRecursivelyViaWalking(top):
         )
 
 
+def _GetLogdirSubdirectoriesViaCloudEventGlobbing(path):
+    """Finds event-file directories in cloud logdirs via targeted globbing.
+
+    Some TensorFlow `tf.io.gfile.glob` backends return false negatives for
+    `**/*tfevents*` even when deeper descendants contain matching event files.
+    Try a second recursive pattern before falling back to the legacy globbing
+    traversal, which is slower but has the expected behavior.
+
+    Args:
+      path: The cloud logdir to search; trailing slashes are tolerated.
+
+    Returns:
+      A tuple of directories that each directly contain at least one
+      events file. Results from targeted globbing are sorted.
+    """
+    escaped = _EscapeGlobCharacters(path)
+    # Drop trailing slashes so the patterns below never contain an empty
+    # path component ("logdir//**"), but leave a bare scheme such as
+    # "gs://" untouched.
+    stripped = escaped.rstrip("/")
+    if stripped and not stripped.endswith(":"):
+        escaped = stripped
+    glob_patterns = (
+        escaped + "/**/*tfevents*",
+        escaped + "/**/**/*tfevents*",
+    )
+    dirs = set()
+    for glob_pattern in glob_patterns:
+        event_files = tf.io.gfile.glob(glob_pattern)
+        logger.info(
+            "GetLogdirSubdirectories: Found %d event files via glob %r.",
+            len(event_files),
+            glob_pattern,
+        )
+        for f in event_files:
+            if IsTensorFlowEventsFile(f):
+                dirs.add(os.path.dirname(f))
+        if dirs:
+            # Sort so the result does not depend on set iteration order.
+            return tuple(sorted(dirs))
+
+    logger.info(
+        "GetLogdirSubdirectories: Targeted globbing found no event files; "
+        "falling back to legacy cloud globbing."
+    )
+    return tuple(
+        subdir
+        for (subdir, files) in ListRecursivelyViaGlobbing(path)
+        if any(IsTensorFlowEventsFile(f) for f in files)
+    )
+
+
 def GetLogdirSubdirectories(path):
     """Obtains all subdirectories with events files.
 
@@ -203,22 +255,22 @@ def GetLogdirSubdirectories(path):
         )
 
     if io_util.IsCloudPath(path):
-        # Glob-ing for files can be significantly faster than recursively
-        # walking through directories for some file systems.
+        # For cloud filesystems, use targeted recursive globs for event
+        # files instead of listing all files level by level. If the
+        # backend returns a false negative for the fast path, fall back to
+        # the legacy globbing traversal for correctness.
         logger.info(
-            "GetLogdirSubdirectories: Starting to list directories via glob-ing."
+            "GetLogdirSubdirectories: Starting to find event files via targeted glob."
         )
-        traversal_method = ListRecursivelyViaGlobbing
+        return _GetLogdirSubdirectoriesViaCloudEventGlobbing(path)
     else:
-        # For other file systems, the glob-ing based method might be slower because
-        # each call to glob could involve performing a recursive walk.
+        # For local file systems, walking is more efficient because each
+        # glob call could itself involve a recursive walk.
         logger.info(
             "GetLogdirSubdirectories: Starting to list directories via walking."
         )
-        traversal_method = ListRecursivelyViaWalking
-
-    return (
-        subdir
-        for (subdir, files) in traversal_method(path)
-        if any(IsTensorFlowEventsFile(f) for f in files)
-    )
+        return (
+            subdir
+            for (subdir, files) in ListRecursivelyViaWalking(path)
+            if any(IsTensorFlowEventsFile(f) for f in files)
+        )
diff --git a/tensorboard/backend/event_processing/io_wrapper_test.py b/tensorboard/backend/event_processing/io_wrapper_test.py
index b76e136829f..83448af189a 100644
--- a/tensorboard/backend/event_processing/io_wrapper_test.py
+++ b/tensorboard/backend/event_processing/io_wrapper_test.py
@@ -325,6 +325,107 @@ def testGetLogdirSubdirectories(self):
             io_wrapper.GetLogdirSubdirectories(temp_dir),
         )
 
+    def testGetLogdirSubdirectoriesCloudUsesSecondTargetedPattern(self):
+        path = "gs://bucket/logdir"
+        self.stubs.Set(io_wrapper.tf.io.gfile, "exists", lambda _: True)
+        self.stubs.Set(io_wrapper.tf.io.gfile, "isdir", lambda _: True)
+        self.stubs.Set(io_wrapper.io_util, "IsCloudPath", lambda _: True)
+
+        event_files = [
+            "gs://bucket/logdir/run1/tensorboard/events.out.tfevents.1",
+            "gs://bucket/logdir/run2/tensorboard/events.out.tfevents.2",
+        ]
+        glob_calls = []
+        expected_first = "gs://bucket/logdir/**/*tfevents*"
+        expected_second = "gs://bucket/logdir/**/**/*tfevents*"
+
+        def fake_glob(pattern):
+            glob_calls.append(pattern)
+            if pattern == expected_first:
+                return []
+            if pattern == expected_second:
+                return event_files
+            self.fail("unexpected glob pattern: %r" % pattern)
+
+        self.stubs.Set(io_wrapper.tf.io.gfile, "glob", fake_glob)
+        self.stubs.Set(
+            io_wrapper,
+            "ListRecursivelyViaGlobbing",
+            lambda _: self.fail("legacy fallback should not run"),
+        )
+
+        self.assertCountEqual(
+            [
+                "gs://bucket/logdir/run1/tensorboard",
+                "gs://bucket/logdir/run2/tensorboard",
+            ],
+            io_wrapper.GetLogdirSubdirectories(path),
+        )
+        self.assertEqual(
+            [expected_first, expected_second],
+            glob_calls,
+        )
+
+    def testGetLogdirSubdirectoriesCloudFallsBackToLegacyGlobbing(self):
+        path = "gs://bucket/logdir"
+        self.stubs.Set(io_wrapper.tf.io.gfile, "exists", lambda _: True)
+        self.stubs.Set(io_wrapper.tf.io.gfile, "isdir", lambda _: True)
+        self.stubs.Set(io_wrapper.io_util, "IsCloudPath", lambda _: True)
+        self.stubs.Set(io_wrapper.tf.io.gfile, "glob", lambda _: [])
+
+        def legacy_listing(_):
+            return iter(
+                [
+                    (
+                        "gs://bucket/logdir/run1/tensorboard",
+                        (
+                            "gs://bucket/logdir/run1/tensorboard/events.out.tfevents.1",
+                        ),
+                    ),
+                    (
+                        "gs://bucket/logdir/run2/tensorboard",
+                        (
+                            "gs://bucket/logdir/run2/tensorboard/model.ckpt",
+                            "gs://bucket/logdir/run2/tensorboard/events.out.tfevents.2",
+                        ),
+                    ),
+                    (
+                        "gs://bucket/logdir/run3",
+                        ("gs://bucket/logdir/run3/model.ckpt",),
+                    ),
+                ]
+            )
+
+        self.stubs.Set(
+            io_wrapper, "ListRecursivelyViaGlobbing", legacy_listing
+        )
+
+        self.assertCountEqual(
+            [
+                "gs://bucket/logdir/run1/tensorboard",
+                "gs://bucket/logdir/run2/tensorboard",
+            ],
+            io_wrapper.GetLogdirSubdirectories(path),
+        )
+
+    def testGetLogdirSubdirectoriesCloudToleratesTrailingSlash(self):
+        self.stubs.Set(io_wrapper.tf.io.gfile, "exists", lambda _: True)
+        self.stubs.Set(io_wrapper.tf.io.gfile, "isdir", lambda _: True)
+        self.stubs.Set(io_wrapper.io_util, "IsCloudPath", lambda _: True)
+        glob_calls = []
+
+        def fake_glob(pattern):
+            glob_calls.append(pattern)
+            return ["gs://bucket/logdir/run1/events.out.tfevents.1"]
+
+        self.stubs.Set(io_wrapper.tf.io.gfile, "glob", fake_glob)
+
+        self.assertEqual(
+            ("gs://bucket/logdir/run1",),
+            tuple(io_wrapper.GetLogdirSubdirectories("gs://bucket/logdir/")),
+        )
+        self.assertEqual(["gs://bucket/logdir/**/*tfevents*"], glob_calls)
+
     def _CreateDeepDirectoryStructure(self, top_directory):
         """Creates a reasonable deep structure of subdirectories with files.
 