From 2c78489b70e76f9af85d96024fe48cab0a36e353 Mon Sep 17 00:00:00 2001
From: bzantium
Date: Sun, 5 Apr 2026 00:48:36 +0900
Subject: [PATCH] Handle exceptions in data reload loop to prevent silent data
 staleness

The Reloader thread/process in LocalDataIngester crashes on any unhandled
exception (e.g. transient network errors when reading from remote filesystems
like GCS). Once the reload loop dies, TensorBoard continues serving stale data
with no indication to the user. Wrap the reload loop body in a try/except so
that transient errors are logged and the next reload cycle proceeds normally.
---
 .../backend/event_processing/data_ingester.py | 30 +++++++++++++-------
 .../event_processing/data_ingester_test.py    | 34 +++++++++++++++++++
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/tensorboard/backend/event_processing/data_ingester.py b/tensorboard/backend/event_processing/data_ingester.py
index 52ec9e3275f..3c5f4e576a8 100644
--- a/tensorboard/backend/event_processing/data_ingester.py
+++ b/tensorboard/backend/event_processing/data_ingester.py
@@ -98,18 +98,24 @@ def start(self):
 
         def _reload():
             while True:
-                start = time.time()
-                logger.info("TensorBoard reload process beginning")
-                for path, name in self._path_to_run.items():
-                    self._multiplexer.AddRunsFromDirectory(path, name)
-                logger.info(
-                    "TensorBoard reload process: Reload the whole Multiplexer"
-                )
-                self._multiplexer.Reload()
-                duration = time.time() - start
-                logger.info(
-                    "TensorBoard done reloading. Load took %0.3f secs", duration
-                )
+                try:
+                    start = time.time()
+                    logger.info("TensorBoard reload process beginning")
+                    for path, name in self._path_to_run.items():
+                        self._multiplexer.AddRunsFromDirectory(path, name)
+                    logger.info(
+                        "TensorBoard reload process: Reload the whole Multiplexer"
+                    )
+                    self._multiplexer.Reload()
+                    duration = time.time() - start
+                    logger.info(
+                        "TensorBoard done reloading. Load took %0.3f secs",
+                        duration,
+                    )
+                except Exception:
+                    logger.error(
+                        "TensorBoard reload failed", exc_info=True
+                    )
                 if self._reload_interval == 0:
                     # Only load the multiplexer once. Do not continuously reload.
                     break
diff --git a/tensorboard/backend/event_processing/data_ingester_test.py b/tensorboard/backend/event_processing/data_ingester_test.py
index 8a16963f20e..12fe564450e 100644
--- a/tensorboard/backend/event_processing/data_ingester_test.py
+++ b/tensorboard/backend/event_processing/data_ingester_test.py
@@ -61,6 +61,40 @@ def __init__(
         self.window_title = window_title
 
 
+class ReloadErrorHandlingTest(tb_test.TestCase):
+    """Tests that the reload loop survives transient errors."""
+
+    def test_reload_continues_after_exception(self):
+        """Reload loop should log errors and continue, not crash."""
+        flags = FakeFlags(logdir="logdir", reload_interval=0, reload_task="blocking")
+        ingester = data_ingester.LocalDataIngester(flags)
+        # Make AddRunsFromDirectory raise on the first call.
+        with mock.patch.object(
+            ingester._multiplexer, "AddRunsFromDirectory", side_effect=OSError("network error")
+        ) as mock_add:
+            with mock.patch.object(
+                ingester._multiplexer, "Reload"
+            ) as mock_reload:
+                # Should not raise despite the OSError.
+                ingester.start()
+                mock_add.assert_called_once()
+                # Reload should not be called since AddRunsFromDirectory raised first.
+                mock_reload.assert_not_called()
+
+    def test_reload_continues_after_reload_exception(self):
+        """Reload loop should survive errors from Reload() as well."""
+        flags = FakeFlags(logdir="logdir", reload_interval=0, reload_task="blocking")
+        ingester = data_ingester.LocalDataIngester(flags)
+        with mock.patch.object(
+            ingester._multiplexer, "AddRunsFromDirectory"
+        ):
+            with mock.patch.object(
+                ingester._multiplexer, "Reload", side_effect=RuntimeError("reload failed")
+            ):
+                # Should not raise despite the RuntimeError.
+                ingester.start()
+
+
 class GetEventFileActiveFilterTest(tb_test.TestCase):
     def testDisabled(self):
         flags = FakeFlags(logdir="logdir", reload_multifile=False)