From 31c6ffec4120eae245efc9148efe15b0545b5e20 Mon Sep 17 00:00:00 2001 From: bzantium Date: Sun, 5 Apr 2026 01:10:02 +0900 Subject: [PATCH 1/2] Use targeted glob for cloud logdir subdirectory discovery Replace the level-by-level globbing approach for cloud filesystems (GCS, S3) with a single recursive glob for *tfevents* files. The previous method listed all files at every directory level, which is extremely slow when the directory tree contains many non-event files such as model checkpoints. For a test case with ~26,000 checkpoint files alongside 8 event files, this reduces discovery time from ~100s to ~13s. Local filesystem paths are unaffected; they continue to use the walk-based approach. --- .../backend/event_processing/io_wrapper.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/tensorboard/backend/event_processing/io_wrapper.py b/tensorboard/backend/event_processing/io_wrapper.py index 1e23eb14005..595739a2840 100644 --- a/tensorboard/backend/event_processing/io_wrapper.py +++ b/tensorboard/backend/event_processing/io_wrapper.py @@ -203,22 +203,33 @@ def GetLogdirSubdirectories(path): ) if io_util.IsCloudPath(path): - # Glob-ing for files can be significantly faster than recursively - # walking through directories for some file systems. + # For cloud filesystems, use a single targeted recursive glob for + # event files instead of listing all files level by level. This is + # significantly faster when the directory tree contains many + # non-event files (e.g., model checkpoints). logger.info( - "GetLogdirSubdirectories: Starting to list directories via glob-ing." + "GetLogdirSubdirectories: Starting to find event files via targeted glob." ) - traversal_method = ListRecursivelyViaGlobbing + escaped = _EscapeGlobCharacters(path) + glob_pattern = escaped + "/**/*tfevents*" + event_files = tf.io.gfile.glob(glob_pattern) + logger.info( + "GetLogdirSubdirectories: Found %d event files via glob.", + len(event_files), + ) + dirs = set() + for f in event_files: + if IsTensorFlowEventsFile(f): + dirs.add(os.path.dirname(f)) + return tuple(dirs) else: - # For other file systems, the glob-ing based method might be slower because - # each call to glob could involve performing a recursive walk. + # For local file systems, walking is more efficient because each + # glob call could itself involve a recursive walk. logger.info( "GetLogdirSubdirectories: Starting to list directories via walking." ) - traversal_method = ListRecursivelyViaWalking - - return ( - subdir - for (subdir, files) in traversal_method(path) - if any(IsTensorFlowEventsFile(f) for f in files) - ) + return ( + subdir + for (subdir, files) in ListRecursivelyViaWalking(path) + if any(IsTensorFlowEventsFile(f) for f in files) + ) From d2f31ace7aea287d55789163ca5c73ef946a1bdf Mon Sep 17 00:00:00 2001 From: bzantium Date: Sun, 5 Apr 2026 10:27:16 +0900 Subject: [PATCH 2/2] Fix cloud event glob false negatives --- .../backend/event_processing/io_wrapper.py | 57 +++++++++---- .../event_processing/io_wrapper_test.py | 83 +++++++++++++++++++ 2 files changed, 125 insertions(+), 15 deletions(-) diff --git a/tensorboard/backend/event_processing/io_wrapper.py b/tensorboard/backend/event_processing/io_wrapper.py index 595739a2840..ab520ace98d 100644 --- a/tensorboard/backend/event_processing/io_wrapper.py +++ b/tensorboard/backend/event_processing/io_wrapper.py @@ -176,6 +176,44 @@ def ListRecursivelyViaWalking(top): ) +def _GetLogdirSubdirectoriesViaCloudEventGlobbing(path): + """Finds event-file directories in cloud logdirs via targeted globbing. + + Some TensorFlow `tf.io.gfile.glob` backends return false negatives for + `**/*tfevents*` even when deeper descendants contain matching event files. + Try a second recursive pattern before falling back to the legacy globbing + traversal, which is slower but has the expected behavior. + """ + escaped = _EscapeGlobCharacters(path) + glob_patterns = ( + escaped + "/**/*tfevents*", + escaped + "/**/**/*tfevents*", + ) + dirs = set() + for glob_pattern in glob_patterns: + event_files = tf.io.gfile.glob(glob_pattern) + logger.info( + "GetLogdirSubdirectories: Found %d event files via glob %r.", + len(event_files), + glob_pattern, + ) + for f in event_files: + if IsTensorFlowEventsFile(f): + dirs.add(os.path.dirname(f)) + if dirs: + return tuple(dirs) + + logger.info( + "GetLogdirSubdirectories: Targeted globbing found no event files; " + "falling back to legacy cloud globbing." + ) + return tuple( + subdir + for (subdir, files) in ListRecursivelyViaGlobbing(path) + if any(IsTensorFlowEventsFile(f) for f in files) + ) + + def GetLogdirSubdirectories(path): """Obtains all subdirectories with events files. @@ -204,24 +242,13 @@ def GetLogdirSubdirectories(path): if io_util.IsCloudPath(path): # For cloud filesystems, use a single targeted recursive glob for - # event files instead of listing all files level by level. This is - # significantly faster when the directory tree contains many - # non-event files (e.g., model checkpoints). + # event files instead of listing all files level by level. If the + # backend returns a false negative for the fast path, fall back to the + # legacy globbing traversal for correctness. logger.info( "GetLogdirSubdirectories: Starting to find event files via targeted glob." ) - escaped = _EscapeGlobCharacters(path) - glob_pattern = escaped + "/**/*tfevents*" - event_files = tf.io.gfile.glob(glob_pattern) - logger.info( - "GetLogdirSubdirectories: Found %d event files via glob.", - len(event_files), - ) - dirs = set() - for f in event_files: - if IsTensorFlowEventsFile(f): - dirs.add(os.path.dirname(f)) - return tuple(dirs) + return _GetLogdirSubdirectoriesViaCloudEventGlobbing(path) else: # For local file systems, walking is more efficient because each # glob call could itself involve a recursive walk. diff --git a/tensorboard/backend/event_processing/io_wrapper_test.py b/tensorboard/backend/event_processing/io_wrapper_test.py index b76e136829f..83448af189a 100644 --- a/tensorboard/backend/event_processing/io_wrapper_test.py +++ b/tensorboard/backend/event_processing/io_wrapper_test.py @@ -325,6 +325,89 @@ def testGetLogdirSubdirectories(self): io_wrapper.GetLogdirSubdirectories(temp_dir), ) + def testGetLogdirSubdirectoriesCloudUsesSecondTargetedPattern(self): + path = "gs://bucket/logdir" + self.stubs.Set(io_wrapper.tf.io.gfile, "exists", lambda _: True) + self.stubs.Set(io_wrapper.tf.io.gfile, "isdir", lambda _: True) + self.stubs.Set(io_wrapper.io_util, "IsCloudPath", lambda _: True) + + event_files = [ + "gs://bucket/logdir/run1/tensorboard/events.out.tfevents.1", + "gs://bucket/logdir/run2/tensorboard/events.out.tfevents.2", + ] + glob_calls = [] + expected_first = "gs://bucket/logdir/**/*tfevents*" + expected_second = "gs://bucket/logdir/**/**/*tfevents*" + + def fake_glob(pattern): + glob_calls.append(pattern) + if pattern == expected_first: + return [] + if pattern == expected_second: + return event_files + self.fail("unexpected glob pattern: %r" % pattern) + + self.stubs.Set(io_wrapper.tf.io.gfile, "glob", fake_glob) + self.stubs.Set( + io_wrapper, + "ListRecursivelyViaGlobbing", + lambda _: self.fail("legacy fallback should not run"), + ) + + self.assertCountEqual( + [ + "gs://bucket/logdir/run1/tensorboard", + "gs://bucket/logdir/run2/tensorboard", + ], + io_wrapper.GetLogdirSubdirectories(path), + ) + self.assertEqual( + [expected_first, expected_second], + glob_calls, + ) + + def testGetLogdirSubdirectoriesCloudFallsBackToLegacyGlobbing(self): + path = "gs://bucket/logdir" + self.stubs.Set(io_wrapper.tf.io.gfile, "exists", lambda _: True) + self.stubs.Set(io_wrapper.tf.io.gfile, "isdir", lambda _: True) + self.stubs.Set(io_wrapper.io_util, "IsCloudPath", lambda _: True) + self.stubs.Set(io_wrapper.tf.io.gfile, "glob", lambda _: []) + + def legacy_listing(_): + return iter( + [ + ( + "gs://bucket/logdir/run1/tensorboard", + ( + "gs://bucket/logdir/run1/tensorboard/events.out.tfevents.1", + ), + ), + ( + "gs://bucket/logdir/run2/tensorboard", + ( + "gs://bucket/logdir/run2/tensorboard/model.ckpt", + "gs://bucket/logdir/run2/tensorboard/events.out.tfevents.2", + ), + ), + ( + "gs://bucket/logdir/run3", + ("gs://bucket/logdir/run3/model.ckpt",), + ), + ] + ) + + self.stubs.Set( + io_wrapper, "ListRecursivelyViaGlobbing", legacy_listing + ) + + self.assertCountEqual( + [ + "gs://bucket/logdir/run1/tensorboard", + "gs://bucket/logdir/run2/tensorboard", + ], + io_wrapper.GetLogdirSubdirectories(path), + ) + def _CreateDeepDirectoryStructure(self, top_directory): """Creates a reasonable deep structure of subdirectories with files.