diff --git a/tensorboard/backend/event_processing/data_ingester.py b/tensorboard/backend/event_processing/data_ingester.py
index 52ec9e3275f..3c5f4e576a8 100644
--- a/tensorboard/backend/event_processing/data_ingester.py
+++ b/tensorboard/backend/event_processing/data_ingester.py
@@ -98,18 +98,24 @@ def start(self):
 
         def _reload():
             while True:
-                start = time.time()
-                logger.info("TensorBoard reload process beginning")
-                for path, name in self._path_to_run.items():
-                    self._multiplexer.AddRunsFromDirectory(path, name)
-                logger.info(
-                    "TensorBoard reload process: Reload the whole Multiplexer"
-                )
-                self._multiplexer.Reload()
-                duration = time.time() - start
-                logger.info(
-                    "TensorBoard done reloading. Load took %0.3f secs", duration
-                )
+                try:
+                    start = time.time()
+                    logger.info("TensorBoard reload process beginning")
+                    for path, name in self._path_to_run.items():
+                        self._multiplexer.AddRunsFromDirectory(path, name)
+                    logger.info(
+                        "TensorBoard reload process: Reload the whole Multiplexer"
+                    )
+                    self._multiplexer.Reload()
+                    duration = time.time() - start
+                    logger.info(
+                        "TensorBoard done reloading. Load took %0.3f secs",
+                        duration,
+                    )
+                except Exception:
+                    # Log the traceback instead of letting a failed reload
+                    # propagate out of the loop.
+                    logger.exception("TensorBoard reload failed")
                 if self._reload_interval == 0:
                     # Only load the multiplexer once. Do not continuously reload.
                     break
diff --git a/tensorboard/backend/event_processing/data_ingester_test.py b/tensorboard/backend/event_processing/data_ingester_test.py
index 8a16963f20e..12fe564450e 100644
--- a/tensorboard/backend/event_processing/data_ingester_test.py
+++ b/tensorboard/backend/event_processing/data_ingester_test.py
@@ -61,6 +61,48 @@ def __init__(
         self.window_title = window_title
 
 
+class ReloadErrorHandlingTest(tb_test.TestCase):
+    """Tests that the reload loop survives transient errors."""
+
+    def test_reload_continues_after_exception(self):
+        """Reload loop should log errors and continue, not crash."""
+        flags = FakeFlags(
+            logdir="logdir", reload_interval=0, reload_task="blocking"
+        )
+        ingester = data_ingester.LocalDataIngester(flags)
+        # Make AddRunsFromDirectory raise on the first call.
+        with mock.patch.object(
+            ingester._multiplexer,
+            "AddRunsFromDirectory",
+            side_effect=OSError("network error"),
+        ) as mock_add:
+            with mock.patch.object(
+                ingester._multiplexer, "Reload"
+            ) as mock_reload:
+                # Should not raise despite the OSError.
+                ingester.start()
+        mock_add.assert_called_once()
+        # Reload should not be called since AddRunsFromDirectory raised first.
+        mock_reload.assert_not_called()
+
+    def test_reload_continues_after_reload_exception(self):
+        """Reload loop should survive errors from Reload() as well."""
+        flags = FakeFlags(
+            logdir="logdir", reload_interval=0, reload_task="blocking"
+        )
+        ingester = data_ingester.LocalDataIngester(flags)
+        with mock.patch.object(ingester._multiplexer, "AddRunsFromDirectory"):
+            with mock.patch.object(
+                ingester._multiplexer,
+                "Reload",
+                side_effect=RuntimeError("reload failed"),
+            ) as mock_reload:
+                # Should not raise despite the RuntimeError.
+                ingester.start()
+        # Confirm the failing Reload() was actually reached and swallowed.
+        mock_reload.assert_called_once()
+
+
 class GetEventFileActiveFilterTest(tb_test.TestCase):
     def testDisabled(self):
         flags = FakeFlags(logdir="logdir", reload_multifile=False)