@@ -981,8 +981,33 @@ def DeleteTaskHub(self, request: pb.DeleteTaskHubRequest, context):
981981 return pb .DeleteTaskHubResponse ()
982982
def RewindInstance(self, request: pb.RewindInstanceRequest, context):
    """Rewinds a failed orchestration instance.

    While holding ``self._lock``, looks up the instance named by
    ``request.instanceId``, checks that it exists and that its status is
    ``ORCHESTRATION_STATUS_FAILED``, and then hands it to
    ``_prepare_rewind``. That helper appends an ``ExecutionRewoundEvent``
    to the pending events, resets the instance status to RUNNING, and
    re-enqueues the orchestration so the worker can replay it and produce
    a ``RewindOrchestrationAction`` with the corrected history.

    Args:
        request: The rewind request. ``instanceId`` selects the instance;
            ``reason`` is forwarded to ``_prepare_rewind`` when the field
            is set, otherwise ``None`` is passed.
        context: The RPC context. ``abort`` is called on it with
            ``NOT_FOUND`` when the instance is unknown and with
            ``FAILED_PRECONDITION`` when the instance is not failed.

    Returns:
        An empty ``pb.RewindInstanceResponse``.
    """
    instance_id = request.instanceId

    with self._lock:
        instance = self._instances.get(instance_id)
        if not instance:
            context.abort(
                grpc.StatusCode.NOT_FOUND,
                f"Orchestration instance '{instance_id}' not found")
            # NOTE(review): grpc's ServicerContext.abort raises, so this
            # return presumably only matters for contexts whose abort()
            # returns (e.g. test doubles) — confirm against callers.
            return pb.RewindInstanceResponse()

        if instance.status != pb.ORCHESTRATION_STATUS_FAILED:
            context.abort(
                grpc.StatusCode.FAILED_PRECONDITION,
                f"Orchestration instance '{instance_id}' is not in a failed state")
            return pb.RewindInstanceResponse()

        # Only forward a reason the caller explicitly set on the request.
        reason = request.reason.value if request.HasField("reason") else None
        # _prepare_rewind documents that it must be called with the lock held.
        self._prepare_rewind(instance, reason)

    self._logger.info(f"Rewound instance '{instance_id}'")
    return pb.RewindInstanceResponse()
9861011
9871012 def AbandonTaskActivityWorkItem (self , request : pb .AbandonActivityTaskRequest , context ):
9881013 """Abandons an activity work item."""
@@ -1196,6 +1221,8 @@ def _process_action(self, instance: OrchestrationInstance, action: pb.Orchestrat
11961221 self ._process_send_event_action (action .sendEvent )
11971222 elif action .HasField ("sendEntityMessage" ):
11981223 self ._process_send_entity_message_action (instance , action )
1224+ elif action .HasField ("rewindOrchestration" ):
1225+ self ._process_rewind_orchestration_action (instance , action .rewindOrchestration )
11991226
12001227 def _process_complete_orchestration_action (self , instance : OrchestrationInstance ,
12011228 complete_action : pb .CompleteOrchestrationAction ):
@@ -1205,6 +1232,14 @@ def _process_complete_orchestration_action(self, instance: OrchestrationInstance
12051232 instance .output = complete_action .result .value if complete_action .result else None
12061233 instance .failure_details = complete_action .failureDetails if complete_action .failureDetails else None
12071234
1235+ # Append orchestratorCompleted to history when the orchestration
1236+ # reaches a terminal state. This positional marker allows the
1237+ # SDK to distinguish a post-rewind replay from a new rewind
1238+ # request by comparing the position of the last
1239+ # orchestratorCompleted against the last executionRewound.
1240+ if status != pb .ORCHESTRATION_STATUS_CONTINUED_AS_NEW :
1241+ instance .history .append (helpers .new_orchestrator_completed_event ())
1242+
12081243 if status == pb .ORCHESTRATION_STATUS_CONTINUED_AS_NEW :
12091244 # Handle continue-as-new
12101245 new_input = complete_action .result .value if complete_action .result else None
@@ -1558,6 +1593,119 @@ def _signal_entity_internal(self, entity_id: str, operation: str,
15581593 )
15591594 self ._queue_entity_operation (entity_id , event )
15601595
def _prepare_rewind(self, instance: OrchestrationInstance,
                    reason: Optional[str] = None):
    """Puts a failed instance back into a runnable state for a rewind.

    The instance is marked RUNNING with its output and failure details
    cleared, its dispatched events are dropped, and a fresh
    ``ExecutionRewoundEvent`` is appended to its pending events. The
    instance then receives a new completion token, is removed from the
    in-flight set, and is enqueued again. No history is rewritten here;
    that happens when the worker answers with a rewind action.

    Args:
        instance: The orchestration instance to rewind.
        reason: Optional rewind reason. A falsy value (``None`` or an
            empty string) leaves the event's ``reason`` field unset.

    Note:
        Must be called while holding ``self._lock``.
    """
    # Build the pending ExecutionRewound history event first; it does not
    # depend on any of the instance state mutated below.
    reason_value = wrappers_pb2.StringValue(value=reason) if reason else None
    rewound_payload = pb.ExecutionRewoundEvent(reason=reason_value)
    rewound_history_event = pb.HistoryEvent(
        eventId=-1,
        timestamp=timestamp_pb2.Timestamp(),
        executionRewound=rewound_payload,
    )

    # Wipe the terminal state so the instance can be processed again.
    instance.failure_details = None
    instance.output = None
    instance.status = pb.ORCHESTRATION_STATUS_RUNNING
    instance.last_updated_at = datetime.now(tz=timezone.utc)

    # Events dispatched before the rewind are stale; drop them and queue
    # the rewind event as new work.
    instance.dispatched_events.clear()
    instance.pending_events.append(rewound_history_event)

    # Hand out the next completion token, then schedule the instance.
    token = self._next_completion_token
    instance.completion_token = token
    self._next_completion_token = token + 1

    orchestration_id = instance.instance_id
    self._orchestration_in_flight.discard(orchestration_id)
    self._enqueue_orchestration(orchestration_id)
1636+
def _process_rewind_orchestration_action(
        self, instance: OrchestrationInstance,
        rewind_action: pb.RewindOrchestrationAction):
    """Processes a RewindOrchestrationAction returned by the SDK.

    The action contains a ``newHistory`` field with the rewritten
    history computed by the SDK (failed tasks and sub-orchestration
    failures removed). The backend replaces the instance's history
    with this new history, recursively rewinds any failed
    sub-orchestrations, and re-enqueues the orchestration.

    Args:
        instance: The orchestration instance whose history is replaced.
        rewind_action: The action carrying the rewritten ``newHistory``.

    Note:
        When the history holds several ``executionRewound`` events
        (an instance rewound more than once), the most recent one is
        used both for the reason passed to sub-orchestrations and as the
        event re-queued into ``pending_events``.
    """
    new_history = list(rewind_action.newHistory)

    # Replace history with the rewritten version and reset terminal state.
    instance.history = new_history
    instance.status = pb.ORCHESTRATION_STATUS_RUNNING
    instance.output = None
    instance.failure_details = None
    instance.last_updated_at = datetime.now(timezone.utc)

    # Identify sub-orchestrations that were created but did not
    # complete successfully — they need to be recursively rewound.
    # Created events are keyed by their eventId, which completion
    # events reference through taskScheduledId.
    completed_sub_orch_task_ids: set[int] = set()
    created_sub_orchs: dict[int, str] = {}
    for event in new_history:
        if event.HasField("subOrchestrationInstanceCreated"):
            created_sub_orchs[event.eventId] = (
                event.subOrchestrationInstanceCreated.instanceId)
        elif event.HasField("subOrchestrationInstanceCompleted"):
            completed_sub_orch_task_ids.add(
                event.subOrchestrationInstanceCompleted.taskScheduledId)

    # Locate the most recent ExecutionRewound event once; it supplies
    # both the rewind reason and the event that is re-queued below, so
    # the two can never disagree after repeated rewinds.
    rewind_event = next(
        (event for event in reversed(new_history)
         if event.HasField("executionRewound")),
        None)
    reason: Optional[str] = None
    if (rewind_event is not None
            and rewind_event.executionRewound.HasField("reason")):
        reason = rewind_event.executionRewound.reason.value

    # Recursively rewind failed sub-orchestrations.
    for task_id, sub_instance_id in created_sub_orchs.items():
        if task_id in completed_sub_orch_task_ids:
            continue
        sub_instance = self._instances.get(sub_instance_id)
        if (sub_instance
                and sub_instance.status == pb.ORCHESTRATION_STATUS_FAILED):
            self._prepare_rewind(sub_instance, reason)
            self._watch_sub_orchestration(
                instance.instance_id, sub_instance_id, task_id)

    # Re-enqueue so the orchestration replays with the clean history.
    # The executionRewound event is added to pending_events so the
    # worker can see it in new_events; the worker uses the presence
    # of executionRewound in old_events (history) to distinguish
    # this normal post-rewind replay from the initial rewind
    # short-circuit. Note: we do NOT add orchestratorStarted here
    # because the work-item dispatch loop already inserts one when
    # the instance has non-empty history.
    instance.pending_events.clear()
    instance.dispatched_events.clear()
    if rewind_event is not None:
        instance.pending_events.append(rewind_event)

    # Refresh the completion token and enqueue.
    instance.completion_token = self._next_completion_token
    self._next_completion_token += 1
    self._orchestration_in_flight.discard(instance.instance_id)
    self._enqueue_orchestration(instance.instance_id)
1708+
15611709 def _enqueue_entity (self , entity_id : str ):
15621710 """Enqueues an entity for processing."""
15631711 if entity_id not in self ._entity_queue_set :
0 commit comments