-
Notifications
You must be signed in to change notification settings - Fork 12
Add exponential backoff to coordinator for persistent lock failures #888
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,15 +3,20 @@ | |
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Callable | ||
| from datetime import datetime | ||
| from datetime import datetime, timedelta | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| from homeassistant.core import HomeAssistant, callback | ||
| from homeassistant.helpers.event import async_track_time_interval | ||
| from homeassistant.helpers.update_coordinator import DataUpdateCoordinator, UpdateFailed | ||
|
|
||
| from .const import DOMAIN | ||
| from .const import ( | ||
| BACKOFF_FAILURE_THRESHOLD, | ||
| BACKOFF_INITIAL_SECONDS, | ||
| BACKOFF_MAX_SECONDS, | ||
| DOMAIN, | ||
| ) | ||
| from .exceptions import LockCodeManagerError | ||
|
|
||
| if TYPE_CHECKING: | ||
|
|
@@ -40,6 +45,8 @@ def __init__(self, hass: HomeAssistant, lock: BaseLock, config_entry: Any) -> No | |
| config_entry=config_entry, | ||
| ) | ||
| self.data: dict[int, int | str] = {} | ||
| self._consecutive_failures: int = 0 | ||
| self._original_update_interval: timedelta | None = update_interval | ||
|
|
||
| # Set up drift detection timer for locks with hard_refresh_interval | ||
| if lock.hard_refresh_interval: | ||
|
|
@@ -74,18 +81,64 @@ def push_update(self, updates: dict[int, int | str]) -> None: | |
| if new_data == self.data: | ||
| return | ||
|
|
||
| # A successful push update proves the lock is reachable, so reset | ||
| # backoff to re-enable drift checks and normal polling. | ||
| self._reset_backoff() | ||
|
|
||
| self.async_set_updated_data(new_data) | ||
|
|
||
def _apply_backoff(self) -> None:
    """Record a failed update and apply exponential polling backoff.

    Each call increments the consecutive-failure counter. Once the
    counter reaches ``BACKOFF_FAILURE_THRESHOLD``, the polling interval
    doubles per additional failure, starting at ``BACKOFF_INITIAL_SECONDS``
    and capped at ``BACKOFF_MAX_SECONDS``. Locks without a polling
    interval only have their drift checks suppressed.
    """
    self._consecutive_failures += 1
    if self._consecutive_failures < BACKOFF_FAILURE_THRESHOLD:
        return

    backoff_secs = min(
        BACKOFF_INITIAL_SECONDS
        * 2 ** (self._consecutive_failures - BACKOFF_FAILURE_THRESHOLD),
        BACKOFF_MAX_SECONDS,
    )
    if self._original_update_interval is not None:
        previous_interval = self.update_interval
        new_interval = timedelta(seconds=backoff_secs)
        self.update_interval = new_interval
        # The first backoff step can equal the configured scan interval
        # (e.g. both 60s), in which case nothing is actually backed off;
        # only warn when the interval genuinely grows to avoid a
        # misleading/noisy log message.
        if previous_interval is None or new_interval > previous_interval:
            _LOGGER.warning(
                "Update failed %d consecutive times for %s, "
                "backing off polling interval from %s to %ds",
                self._consecutive_failures,
                self._lock.lock.entity_id,
                "unset"
                if previous_interval is None
                else f"{previous_interval.total_seconds():.0f}s",
                backoff_secs,
            )
        else:
            _LOGGER.debug(
                "Update failed %d consecutive times for %s, "
                "polling interval remains %ds",
                self._consecutive_failures,
                self._lock.lock.entity_id,
                backoff_secs,
            )
    else:
        _LOGGER.warning(
            "Update failed %d consecutive times for %s, "
            "suppressing drift checks until recovery",
            self._consecutive_failures,
            self._lock.lock.entity_id,
        )
|
|
||
def _reset_backoff(self) -> None:
    """Clear the failure counter and restore the normal polling interval."""
    # Nothing to do unless we are currently tracking failures.
    if not self._consecutive_failures:
        return
    _LOGGER.info(
        "Lock %s recovered after %d consecutive failures",
        self._lock.lock.entity_id,
        self._consecutive_failures,
    )
    self._consecutive_failures = 0
    if self._original_update_interval is not None:
        self.update_interval = self._original_update_interval
|
|
||
async def async_get_usercodes(self) -> dict[int, int | str]:
    """Fetch the current usercodes from the lock.

    Applies backoff bookkeeping on failure and resets it on success.
    """
    try:
        usercodes = await self._lock.async_internal_get_usercodes()
    except LockCodeManagerError as err:
        self._apply_backoff()
        # Silently fail while we have never successfully retrieved data.
        if not self.last_update_success:
            return {}
        raise UpdateFailed from err
    else:
        self._reset_backoff()
        return usercodes
|
|
||
| async def _async_drift_check(self, now: datetime) -> None: | ||
| """ | ||
| Perform periodic drift detection. | ||
|
|
@@ -98,6 +151,14 @@ async def _async_drift_check(self, now: datetime) -> None: | |
| if not self.last_update_success: | ||
| return | ||
|
|
||
| if self._consecutive_failures >= BACKOFF_FAILURE_THRESHOLD: | ||
| _LOGGER.debug( | ||
| "Skipping drift check for %s (in backoff after %d failures)", | ||
| self._lock.lock.entity_id, | ||
| self._consecutive_failures, | ||
| ) | ||
| return | ||
raman325 marked this conversation as resolved.
Show resolved
Hide resolved
Comment on lines
+154
to
+160
|
||
|
|
||
| _LOGGER.debug( | ||
| "Performing drift detection hard refresh for %s", | ||
| self._lock.lock.entity_id, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In `_apply_backoff()`, the first backoff step sets `update_interval` to `BACKOFF_INITIAL_SECONDS` (60s). With the current default `usercode_scan_interval` also being 60s, this doesn't actually "back off" the polling interval, but the warning message claims it does. Consider only logging when the interval increases (compare previous vs new), and/or include both the old and new intervals in the message so it's not misleading/noisy.