From 8d7aae9122d87305d11e8a2dab1c07676ddc794e Mon Sep 17 00:00:00 2001 From: Victor Fusco <1221933+vfusco@users.noreply.github.com> Date: Sat, 7 Mar 2026 19:01:30 -0300 Subject: [PATCH 1/2] refactor(appstatus): unify inoperable handling and add recoverable FAILED state --- .../root/app/status/status.go | 49 +++- internal/advancer/advancer.go | 100 ++++--- internal/advancer/advancer_test.go | 58 +++- internal/appstatus/appstatus.go | 153 +++++++++++ internal/appstatus/appstatus_test.go | 250 ++++++++++++++++++ internal/claimer/claimer.go | 37 +-- internal/evmreader/evmreader.go | 19 +- internal/jsonrpc/jsonrpc-discover.json | 7 +- internal/manager/manager.go | 6 +- internal/model/models.go | 20 +- internal/prt/prt.go | 18 +- .../rollupsdb/public/enum/applicationstate.go | 2 + .../000001_create_initial_schema.up.sql | 31 ++- .../repotest/application_test_cases.go | 236 ++++++++++++++++- internal/validator/validator.go | 19 +- 15 files changed, 869 insertions(+), 136 deletions(-) create mode 100644 internal/appstatus/appstatus.go create mode 100644 internal/appstatus/appstatus_test.go diff --git a/cmd/cartesi-rollups-cli/root/app/status/status.go b/cmd/cartesi-rollups-cli/root/app/status/status.go index 3dc20c895..10d9f9311 100644 --- a/cmd/cartesi-rollups-cli/root/app/status/status.go +++ b/cmd/cartesi-rollups-cli/root/app/status/status.go @@ -4,6 +4,7 @@ package status import ( + "bufio" "fmt" "os" "strings" @@ -15,6 +16,8 @@ import ( "github.com/cartesi/rollups-node/internal/repository/factory" ) +var yesFlag bool + var Cmd = &cobra.Command{ Use: "status [app-name-or-address] [new-status]", Short: "Display or set application status (enabled or disabled)", @@ -31,9 +34,14 @@ cartesi-rollups-cli app status echo-dapp # Set application status: cartesi-rollups-cli app status echo-dapp enabled -cartesi-rollups-cli app status echo-dapp disabled` +cartesi-rollups-cli app status echo-dapp disabled + +# Re-enable a FAILED application without confirmation prompt: +cartesi-rollups-cli app status echo-dapp enabled --yes` func init() { + Cmd.Flags().BoolVarP(&yesFlag, "yes", "y", false, "Skip confirmation prompts") + origHelpFunc := Cmd.HelpFunc() Cmd.SetHelpFunc(func(command *cobra.Command, strings []string) { command.Flags().Lookup("verbose").Hidden = false @@ -62,9 +70,12 @@ func run(cmd *cobra.Command, args []string) { os.Exit(1) } - // If no new status is provided, just display the current status + // If no new status is provided, display the current status and reason if len(args) == 1 { fmt.Println(app.State) + if app.Reason != nil && *app.Reason != "" { + fmt.Printf("Reason: %s\n", *app.Reason) + } os.Exit(0) } @@ -72,7 +83,13 @@ func run(cmd *cobra.Command, args []string) { newStatus := strings.ToLower(args[1]) if app.State == model.ApplicationState_Inoperable { - fmt.Fprintf(os.Stderr, "Error: Cannot execute operation. Application %s is in %s state\n", app.Name, app.State) + fmt.Fprintf(os.Stderr, + "Error: Cannot change state of application %s. It is INOPERABLE (irrecoverable).\n", + app.Name) + if app.Reason != nil { + fmt.Fprintf(os.Stderr, "Reason: %s\n", *app.Reason) + } + fmt.Fprintf(os.Stderr, "Use 'app remove' to remove this application.\n") os.Exit(1) } @@ -92,6 +109,32 @@ func run(cmd *cobra.Command, args []string) { os.Exit(0) } + // Re-enabling a FAILED application requires confirmation + if app.State == model.ApplicationState_Failed && targetState == model.ApplicationState_Enabled && !yesFlag { + fmt.Printf("Application %q is in FAILED state.\n", app.Name) + if app.Reason != nil { + fmt.Printf("Reason: %s\n", *app.Reason) + } + fmt.Println("Re-enabling will attempt to restart processing from the last snapshot.") + fmt.Print("Proceed? [y/N] ") + scanner := bufio.NewScanner(os.Stdin) + scanner.Scan() + if scanErr := scanner.Err(); scanErr != nil { + fmt.Fprintf(os.Stderr, "Error reading input: %v\n", scanErr) + os.Exit(1) + } + answer := strings.TrimSpace(strings.ToLower(scanner.Text())) + if answer != "y" && answer != "yes" { + fmt.Println("Aborted.") + os.Exit(0) + } + } + + // Show failure reason when changing state away from FAILED + if app.State == model.ApplicationState_Failed && app.Reason != nil && *app.Reason != "" { + fmt.Printf("Previous failure reason: %s\n", *app.Reason) + } + err = repo.UpdateApplicationState(ctx, app.ID, targetState, nil) cobra.CheckErr(err) diff --git a/internal/advancer/advancer.go b/internal/advancer/advancer.go index 4b309b932..bb39c6b73 100644 --- a/internal/advancer/advancer.go +++ b/internal/advancer/advancer.go @@ -11,6 +11,7 @@ import ( "path/filepath" "strings" + "github.com/cartesi/rollups-node/internal/appstatus" "github.com/cartesi/rollups-node/internal/manager" . "github.com/cartesi/rollups-node/internal/model" "github.com/cartesi/rollups-node/internal/repository" @@ -57,8 +58,11 @@ func getUnprocessedInputs( return repo.ListInputs(ctx, appAddress, f, repository.Pagination{Limit: batchSize}, false) } -// Step performs one processing cycle of the advancer -// It updates machines, gets unprocessed inputs, processes them, and updates epochs +// Step performs one processing cycle of the advancer. +// It updates machines, gets unprocessed inputs, processes them, and updates epochs. +// Per-app errors are accumulated so that a failure in one application does not block +// processing of other healthy applications. Context cancellation is always propagated +// immediately. func (s *Service) Step(ctx context.Context) error { // Check for context cancellation if err := ctx.Err(); err != nil { @@ -74,38 +78,53 @@ func (s *Service) Step(ctx context.Context) error { // Get all applications with active machines apps := s.machineManager.Applications() - // Process inputs for each application + // Process inputs for each application, accumulating per-app errors + var errs []error for _, app := range apps { - appAddress := app.IApplicationAddress.String() + if err := s.stepApp(ctx, app); err != nil { + // Context cancellation means the node is shutting down — stop immediately. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + errs = append(errs, err) + } + } - epochs, _, err := getUnprocessedEpochs(ctx, s.repository, appAddress) - if err != nil { + return errors.Join(errs...) +} + +// stepApp processes all unprocessed epochs and inputs for a single application. +func (s *Service) stepApp(ctx context.Context, app *Application) error { + appAddress := app.IApplicationAddress.String() + + epochs, _, err := getUnprocessedEpochs(ctx, s.repository, appAddress) + if err != nil { + return err + } + + for _, epoch := range epochs { + if err := s.processEpochInputs(ctx, app, epoch.Index); err != nil { return err } - for _, epoch := range epochs { - if err := s.processEpochInputs(ctx, app, epoch.Index); err != nil { - return err - } + if epoch.Status == EpochStatus_Closed { + if allProcessed, perr := s.isAllEpochInputsProcessed(app, epoch); perr == nil && allProcessed { + err := s.handleEpochAfterInputsProcessed(ctx, app, epoch) + if err != nil { + return err + } - if epoch.Status == EpochStatus_Closed { - if allProcessed, perr := s.isAllEpochInputsProcessed(app, epoch); perr == nil && allProcessed { - err := s.handleEpochAfterInputsProcessed(ctx, app, epoch) - if err != nil { - return err - } - - // Update epochs to mark inputs as processed - err = s.repository.UpdateEpochInputsProcessed(ctx, appAddress, epoch.Index) - if err != nil { - return err - } - s.Logger.Info("Epoch updated to Inputs Processed", "application", app.Name, "epoch_index", epoch.Index) - } else if perr != nil { - return perr - } else { - break // some inputs were not processed yet, check next time + // Update epochs to mark inputs as processed + err = s.repository.UpdateEpochInputsProcessed(ctx, appAddress, epoch.Index) + if err != nil { + return err } + s.Logger.Info("Epoch updated to Inputs Processed", + "application", app.Name, "epoch_index", epoch.Index) + } else if perr != nil { + return perr + } else { + break // some inputs were not processed yet, check next time } } } @@ -177,27 +196,27 @@ func (s *Service) processInputs(ctx context.Context, app *Application, inputs [] result, err := machine.Advance(ctx, input.RawData, input.EpochIndex, input.Index, app.IsDaveConsensus()) input.RawData = nil // allow GC to collect payload while batch continues if err != nil { - // If there's an error, mark the application as inoperable + // If there's an error, mark the application as failed s.Logger.Error("Error executing advance", "application", app.Name, "index", input.Index, "error", err) - // If the error is due to context cancellation, don't mark as inoperable + // If the error is due to context cancellation, don't mark as failed if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return err } - reason := err.Error() - updateErr := s.repository.UpdateApplicationState(ctx, app.ID, ApplicationState_Inoperable, &reason) - if updateErr != nil { - s.Logger.Error("Failed to update application state", - "application", app.Name, - "error", updateErr) + if dbErr := appstatus.SetFailed(ctx, s.Logger, s.repository, app, err.Error()); dbErr != nil { + s.Logger.Error("Failed to persist FAILED state — machine will be closed "+ + "but app remains ENABLED in DB; it will be re-created from the "+ + "last snapshot on the next tick. If the root cause persists, "+ + "this may loop.", + "application", app.Name, "db_error", dbErr) } // Eagerly close the machine to release the child process. - // The app is already inoperable, so no further operations will succeed. + // The app has failed, so no further operations will succeed. // Skip if the runtime was already destroyed inside the manager. if !errors.Is(err, manager.ErrMachineClosed) { if closeErr := machine.Close(); closeErr != nil { @@ -320,11 +339,12 @@ func (s *Service) handleEpochAfterInputsProcessed(ctx context.Context, app *Appl outputsProof, err := machine.OutputsProof(ctx) if err != nil { // If the runtime was destroyed (e.g., child process crashed), - // mark the app inoperable to avoid an infinite retry loop. + // mark the app as failed to avoid an infinite retry loop. if errors.Is(err, manager.ErrMachineClosed) { - reason := err.Error() - _ = s.repository.UpdateApplicationState(ctx, app.ID, - ApplicationState_Inoperable, &reason) + if dbErr := appstatus.SetFailed(ctx, s.Logger, s.repository, app, err.Error()); dbErr != nil { + s.Logger.Error("Failed to persist FAILED state for crashed machine", + "application", app.Name, "db_error", dbErr) + } } return fmt.Errorf("failed to get outputs proof from machine: %w", err) } diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index 5ef6f60df..3f1ad7f95 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -211,6 +211,49 @@ func (s *AdvancerSuite) TestStep() { require.Nil(err) require.Len(env.repo.StoredResults, 0) }) + + s.Run("FailedAppDoesNotBlockOtherApps", func() { + require := s.Require() + + mm := newMockMachineManager() + app1 := newMockMachine(1) // will fail + app2 := newMockMachine(2) // should still be processed + mm.Map[1] = newMockInstance(app1) + mm.Map[2] = newMockInstance(app2) + res2 := randomAdvanceResult(0) + + repo := &MockRepository{ + GetEpochsReturn: map[common.Address][]*Epoch{ + app1.Application.IApplicationAddress: { + {Index: 0, Status: EpochStatus_Open}, + }, + app2.Application.IApplicationAddress: { + {Index: 0, Status: EpochStatus_Open}, + }, + }, + GetInputsReturn: map[common.Address][]*Input{ + app1.Application.IApplicationAddress: { + newInput(app1.Application.ID, 0, 0, []byte("advance error")), + }, + app2.Application.IApplicationAddress: { + newInput(app2.Application.ID, 0, 0, marshal(res2)), + }, + }, + } + + svc, err := newMockAdvancerService(mm, repo) + require.NoError(err) + + err = svc.Step(context.Background()) + // Step returns a combined error but the healthy app was still processed + require.Error(err) + require.Contains(err.Error(), "advance error") + require.Equal(1, repo.ApplicationStateUpdates) + require.Equal(ApplicationState_Failed, repo.LastApplicationState) + + // app2's input was processed despite app1's failure + require.Len(repo.StoredResults, 1) + }) } func (s *AdvancerSuite) TestGetUnprocessedInputs() { @@ -262,7 +305,7 @@ func (s *AdvancerSuite) TestProcess() { err := env.service.processInputs(context.Background(), env.app.Application, inputs) require.Error(err) require.Equal(1, env.repo.ApplicationStateUpdates) - require.Equal(ApplicationState_Inoperable, env.repo.LastApplicationState) + require.Equal(ApplicationState_Failed, env.repo.LastApplicationState) require.NotNil(env.repo.LastApplicationStateReason) require.Equal("advance error", *env.repo.LastApplicationStateReason) }) @@ -640,6 +683,19 @@ func (s *AdvancerSuite) TestHandleEpochAfterInputsProcessed() { require.Contains(err.Error(), "proof error") }) + s.Run("EmptyEpochIndex0ErrMachineClosedMarksAppFailed", func() { + require := s.Require() + env := s.setupOneApp() + env.app.OutputsProofError = manager.ErrMachineClosed + + epoch := &Epoch{Index: 0, Status: EpochStatus_Closed, InputIndexLowerBound: 0, InputIndexUpperBound: 0} + err := env.service.handleEpochAfterInputsProcessed(context.Background(), env.app.Application, epoch) + require.Error(err) + require.ErrorIs(err, manager.ErrMachineClosed) + require.Equal(1, env.repo.ApplicationStateUpdates) + require.Equal(ApplicationState_Failed, env.repo.LastApplicationState) + }) + s.Run("EmptyEpochIndexGt0RepeatsPreviousProof", func() { require := s.Require() env := s.setupOneApp() diff --git a/internal/appstatus/appstatus.go b/internal/appstatus/appstatus.go new file mode 100644 index 000000000..19ef3a798 --- /dev/null +++ b/internal/appstatus/appstatus.go @@ -0,0 +1,153 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package appstatus + +import ( + "context" + "errors" + "fmt" + "log/slog" + + . "github.com/cartesi/rollups-node/internal/model" +) + +// maxReasonLength is the maximum length of a reason string stored in the +// database. The DB column is VARCHAR(4096); we leave margin to avoid +// constraint violations from deeply-nested error chains. +const maxReasonLength = 4000 + +// Repository is the minimal interface needed to update application state. +type Repository interface { + UpdateApplicationState(ctx context.Context, appID int64, state ApplicationState, reason *string) error +} + +// SetFailed marks an application as FAILED (recoverable). +// Use for machine runtime errors that can be resolved by operator intervention +// (e.g., OOM kill, process crash). The operator can re-enable the application +// after fixing the root cause. +// +// Recovery assumptions — FAILED is safe to re-enable only when: +// - The failure was a machine runtime error (not a DB desync). +// - The last snapshot is consistent with the database state. +// - Synchronize() will correctly replay inputs from the snapshot point. +// +// The reason parameter must be a pre-formatted string describing the failure. +// Returns the database error if the state update fails; returns nil on success. +func SetFailed( + ctx context.Context, + logger *slog.Logger, + repo Repository, + app *Application, + reason string, +) error { + return setApplicationState(ctx, logger, repo, app, ApplicationState_Failed, reason) +} + +// SetFailedf marks an application as FAILED with a formatted reason string. +// Returns the database error if the state update fails; returns nil on success. +// Unlike [SetInoperablef], this intentionally returns nil on success because +// FAILED is recoverable — callers typically continue with their own error. +func SetFailedf( + ctx context.Context, + logger *slog.Logger, + repo Repository, + app *Application, + reasonFmt string, + args ...any, +) error { + return SetFailed(ctx, logger, repo, app, fmt.Sprintf(reasonFmt, args...)) +} + +// SetInoperable marks an application as INOPERABLE (irrecoverable). +// Use for data corruption, state mismatches, invariant violations, and +// on-chain disagreements that cannot be resolved by restarting. +// +// The reason parameter must be a pre-formatted string describing the failure. +// Always returns a non-nil error containing the reason because INOPERABLE is +// a terminal state and callers should always stop processing the application. +func SetInoperable( + ctx context.Context, + logger *slog.Logger, + repo Repository, + app *Application, + reason string, +) error { + reason = truncateReason(reason) + dbErr := setApplicationState(ctx, logger, repo, app, ApplicationState_Inoperable, reason) + reasonErr := errors.New(reason) + if dbErr != nil { + return errors.Join(reasonErr, dbErr) + } + return reasonErr +} + +// SetInoperablef marks an application as INOPERABLE with a formatted reason string. +// It logs the transition, persists the state, and returns a non-nil error containing +// the reason (joined with the DB error if the update failed). +// This function always returns a non-nil error because INOPERABLE is a terminal state +// and callers should always stop processing the application. +func SetInoperablef( + ctx context.Context, + logger *slog.Logger, + repo Repository, + app *Application, + reasonFmt string, + args ...any, +) error { + return SetInoperable(ctx, logger, repo, app, fmt.Sprintf(reasonFmt, args...)) +} + +// truncateReason truncates a reason string to maxReasonLength to avoid +// exceeding the database VARCHAR(4096) constraint. +func truncateReason(reason string) string { + if len(reason) > maxReasonLength { + return reason[:maxReasonLength] + "... (truncated)" + } + return reason +} + +func setApplicationState( + ctx context.Context, + logger *slog.Logger, + repo Repository, + app *Application, + state ApplicationState, + reason string, +) error { + reason = truncateReason(reason) + + switch state { + case ApplicationState_Failed: + logger.Warn("marking application as failed (recoverable)", + "application", app.Name, + "address", app.IApplicationAddress.String(), + "reason", reason) + case ApplicationState_Inoperable: + logger.Error("marking application as inoperable (irrecoverable)", + "application", app.Name, + "address", app.IApplicationAddress.String(), + "reason", reason) + default: + logger.Error("marking application with unexpected state", + "application", app.Name, + "address", app.IApplicationAddress.String(), + "state", state, + "reason", reason) + } + + err := repo.UpdateApplicationState(ctx, app.ID, state, &reason) + if err != nil { + logger.Error("failed to update application state", + "application", app.Name, + "address", app.IApplicationAddress.String(), + "target_state", state, "error", err) + return err + } + + // Only update in-memory state when the DB write succeeds to keep + // the in-memory Application consistent with the database. + app.State = state + app.Reason = &reason + return nil +} diff --git a/internal/appstatus/appstatus_test.go b/internal/appstatus/appstatus_test.go new file mode 100644 index 000000000..07e48261a --- /dev/null +++ b/internal/appstatus/appstatus_test.go @@ -0,0 +1,250 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package appstatus + +import ( + "context" + "errors" + "log/slog" + "strings" + "testing" + + . "github.com/cartesi/rollups-node/internal/model" + "github.com/ethereum/go-ethereum/common" + "github.com/stretchr/testify/suite" +) + +func TestAppStatus(t *testing.T) { + suite.Run(t, new(AppStatusSuite)) +} + +type AppStatusSuite struct{ suite.Suite } + +func newTestApp() *Application { + return &Application{ + ID: 42, + Name: "test-app", + IApplicationAddress: common.HexToAddress("0x1234567890abcdef1234567890abcdef12345678"), + IConsensusAddress: common.HexToAddress("0xabcdefabcdefabcdefabcdefabcdefabcdefabcd"), + IInputBoxAddress: common.HexToAddress("0x1111111111111111111111111111111111111111"), + State: ApplicationState_Enabled, + } +} + +type mockRepo struct { + lastAppID int64 + lastState ApplicationState + lastReason *string + err error + callCount int +} + +func (m *mockRepo) UpdateApplicationState( + _ context.Context, + appID int64, + state ApplicationState, + reason *string, +) error { + m.callCount++ + m.lastAppID = appID + m.lastState = state + m.lastReason = reason + return m.err +} + +func (s *AppStatusSuite) TestSetFailed() { + require := s.Require() + repo := &mockRepo{} + logger := slog.Default() + app := newTestApp() + + err := SetFailed(context.Background(), logger, repo, app, "machine crashed: OOM") + + require.NoError(err) + require.Equal(1, repo.callCount) + require.Equal(int64(42), repo.lastAppID) + require.Equal(ApplicationState_Failed, repo.lastState) + require.NotNil(repo.lastReason) + require.Equal("machine crashed: OOM", *repo.lastReason) + + // Verify in-memory state was updated + require.Equal(ApplicationState_Failed, app.State) + require.NotNil(app.Reason) + require.Equal("machine crashed: OOM", *app.Reason) +} + +func (s *AppStatusSuite) TestSetFailedf() { + require := s.Require() + repo := &mockRepo{} + logger := slog.Default() + app := newTestApp() + + err := SetFailedf(context.Background(), logger, repo, app, + "epoch %d input %d: %s", 5, 42, "timeout") + + require.NoError(err) + require.Equal(1, repo.callCount) + require.Equal(ApplicationState_Failed, repo.lastState) + require.NotNil(repo.lastReason) + require.Equal("epoch 5 input 42: timeout", *repo.lastReason) + + // Verify in-memory state was updated + require.Equal(ApplicationState_Failed, app.State) +} + +func (s *AppStatusSuite) TestSetInoperable() { + require := s.Require() + repo := &mockRepo{} + logger := slog.Default() + app := newTestApp() + + err := SetInoperable(context.Background(), logger, repo, app, "hash mismatch: 0xaa != 0xbb") + + // SetInoperable always returns a non-nil error (INOPERABLE is terminal) + require.Error(err) + require.Contains(err.Error(), "hash mismatch: 0xaa != 0xbb") + require.Equal(1, repo.callCount) + require.Equal(int64(42), repo.lastAppID) + require.Equal(ApplicationState_Inoperable, repo.lastState) + require.NotNil(repo.lastReason) + require.Equal("hash mismatch: 0xaa != 0xbb", *repo.lastReason) + + // Verify in-memory state was updated + require.Equal(ApplicationState_Inoperable, app.State) + require.NotNil(app.Reason) + require.Equal("hash mismatch: 0xaa != 0xbb", *app.Reason) +} + +func (s *AppStatusSuite) TestSetFailedDBError() { + require := s.Require() + dbErr := errors.New("db connection failed") + repo := &mockRepo{err: dbErr} + logger := slog.Default() + app := newTestApp() + + err := SetFailed(context.Background(), logger, repo, app, "process crashed") + + require.ErrorIs(err, dbErr) + require.Equal(1, repo.callCount) + require.Equal(ApplicationState_Failed, repo.lastState) + require.NotNil(repo.lastReason) + require.Equal("process crashed", *repo.lastReason) + + // In-memory state must NOT be updated on DB error to stay consistent + require.Equal(ApplicationState_Enabled, app.State) + require.Nil(app.Reason) +} + +func (s *AppStatusSuite) TestSetInoperableDBError() { + require := s.Require() + dbErr := errors.New("db connection failed") + repo := &mockRepo{err: dbErr} + logger := slog.Default() + app := newTestApp() + + err := SetInoperable(context.Background(), logger, repo, app, "state corruption") + + require.ErrorIs(err, dbErr) + require.Contains(err.Error(), "state corruption") + require.Equal(1, repo.callCount) + require.Equal(ApplicationState_Inoperable, repo.lastState) + + // In-memory state must NOT be updated on DB error to stay consistent + require.Equal(ApplicationState_Enabled, app.State) + require.Nil(app.Reason) +} + +func (s *AppStatusSuite) TestReasonStoredExactly() { + require := s.Require() + repo := &mockRepo{} + logger := slog.Default() + + err := SetFailed(context.Background(), logger, repo, newTestApp(), "epoch 5 input 42: timeout") + + require.NoError(err) + require.NotNil(repo.lastReason) + require.Equal("epoch 5 input 42: timeout", *repo.lastReason) +} + +func (s *AppStatusSuite) TestSetInoperablef() { + require := s.Require() + repo := &mockRepo{} + logger := slog.Default() + app := newTestApp() + + err := SetInoperablef(context.Background(), logger, repo, app, + "epoch %d: hash mismatch %s != %s", 5, "0xaa", "0xbb") + + // SetInoperablef always returns a non-nil error (INOPERABLE is terminal) + require.Error(err) + require.Contains(err.Error(), "epoch 5: hash mismatch 0xaa != 0xbb") + require.Equal(1, repo.callCount) + require.Equal(ApplicationState_Inoperable, repo.lastState) + require.NotNil(repo.lastReason) + require.Equal("epoch 5: hash mismatch 0xaa != 0xbb", *repo.lastReason) + + // Verify in-memory state was updated (DB succeeded) + require.Equal(ApplicationState_Inoperable, app.State) + require.NotNil(app.Reason) + require.Equal("epoch 5: hash mismatch 0xaa != 0xbb", *app.Reason) +} + +func (s *AppStatusSuite) TestSetInoperablefDBError() { + require := s.Require() + dbErr := errors.New("db connection failed") + repo := &mockRepo{err: dbErr} + logger := slog.Default() + app := newTestApp() + + err := SetInoperablef(context.Background(), logger, repo, app, "reason: %s", "test") + + require.Error(err) + require.ErrorIs(err, dbErr) + require.Contains(err.Error(), "reason: test") + + // In-memory state must NOT be updated on DB error + require.Equal(ApplicationState_Enabled, app.State) + require.Nil(app.Reason) +} + +func (s *AppStatusSuite) TestSetFailedfDBError() { + require := s.Require() + dbErr := errors.New("db connection failed") + repo := &mockRepo{err: dbErr} + logger := slog.Default() + + err := SetFailedf(context.Background(), logger, repo, newTestApp(), + "input %d: %s", 7, "crash") + + require.ErrorIs(err, dbErr) + require.Equal(1, repo.callCount) + require.Equal(ApplicationState_Failed, repo.lastState) + require.NotNil(repo.lastReason) + require.Equal("input 7: crash", *repo.lastReason) +} + +func (s *AppStatusSuite) TestReasonTruncation() { + require := s.Require() + repo := &mockRepo{} + logger := slog.Default() + app := newTestApp() + + // Create a reason string longer than maxReasonLength + longReason := strings.Repeat("x", maxReasonLength+500) + + err := SetFailed(context.Background(), logger, repo, app, longReason) + + require.NoError(err) + require.NotNil(repo.lastReason) + require.LessOrEqual(len(*repo.lastReason), maxReasonLength+len("... (truncated)")) + require.True(strings.HasSuffix(*repo.lastReason, "... (truncated)")) + + // Short reasons should pass through unchanged + repo2 := &mockRepo{} + app2 := newTestApp() + err = SetFailed(context.Background(), logger, repo2, app2, "short reason") + require.NoError(err) + require.NotNil(repo2.lastReason) + require.Equal("short reason", *repo2.lastReason) +} diff --git a/internal/claimer/claimer.go b/internal/claimer/claimer.go index f852d448f..5e9d3f3db 100644 --- a/internal/claimer/claimer.go +++ b/internal/claimer/claimer.go @@ -39,11 +39,11 @@ package claimer import ( "context" - "errors" "fmt" "math/big" "time" + "github.com/cartesi/rollups-node/internal/appstatus" "github.com/cartesi/rollups-node/internal/model" "github.com/cartesi/rollups-node/pkg/contracts/iconsensus" @@ -170,8 +170,7 @@ func (s *Service) submitClaimsAndUpdateDatabase( if err != nil { err = s.setApplicationInoperable( s.Context, - app.IApplicationAddress, - prevEpoch.ApplicationID, + app, "database mismatch on epochs. application: %v, epochs: %v (%v), %v (%v).", app.IApplicationAddress, prevEpoch.Index, @@ -196,8 +195,7 @@ func (s *Service) submitClaimsAndUpdateDatabase( if prevClaimSubmissionEvent == nil { err = s.setApplicationInoperable( s.Context, - app.IApplicationAddress, - app.ID, + app, "epoch has no matching event. application: %v, epoch: %v (%v).", app.IApplicationAddress, prevEpoch.Index, @@ -215,8 +213,7 @@ func (s *Service) submitClaimsAndUpdateDatabase( ) err = s.setApplicationInoperable( s.Context, - app.IApplicationAddress, - app.ID, + app, "epoch has an invalid event: %v, epoch: %v (%v). event: %v", currEpoch.Index, prevEpoch.Index, @@ -247,8 +244,7 @@ func (s *Service) submitClaimsAndUpdateDatabase( if !claimSubmittedEventMatches(app, currEpoch, currClaimSubmissionEvent) { err = s.setApplicationInoperable( s.Context, - app.IApplicationAddress, - app.ID, + app, "computed claim does not match event. computed_claim=%v, current_event=%v", currEpoch, currClaimSubmissionEvent, ) @@ -432,29 +428,13 @@ func (s *Service) acceptClaimsAndUpdateDatabase( return errs } -// setApplicationInoperable marks an application as inoperable with the given reason, -// logs any error that occurs during the update, and returns an error with the reason. func (s *Service) setApplicationInoperable( ctx context.Context, - iApplicationAddress common.Address, - id int64, + app *model.Application, reasonFmt string, args ...any, ) error { - reason := fmt.Sprintf(reasonFmt, args...) - appAddress := iApplicationAddress.String() - - // Log the reason first - s.Logger.Error(reason, "application", appAddress) - - // Update application state - err := s.repository.UpdateApplicationState(ctx, id, model.ApplicationState_Inoperable, &reason) - if err != nil { - s.Logger.Error("failed to update application state to inoperable", "app", appAddress, "err", err) - } - - // Return the error with the reason - return errors.New(reason) + return appstatus.SetInoperablef(ctx, s.Logger, s.repository, app, reasonFmt, args...) } func (s *Service) checkConsensusForAddressChange( @@ -467,8 +447,7 @@ func (s *Service) checkConsensusForAddressChange( if app.IConsensusAddress != newConsensusAddress { err = s.setApplicationInoperable( s.Context, - app.IApplicationAddress, - app.ID, + app, "consensus change detected. application: %v.", app.IApplicationAddress, ) diff --git a/internal/evmreader/evmreader.go b/internal/evmreader/evmreader.go index d8b1b45af..59729bd7e 100644 --- a/internal/evmreader/evmreader.go +++ b/internal/evmreader/evmreader.go @@ -15,6 +15,7 @@ import ( "github.com/ethereum/go-ethereum/ethclient" "github.com/ethereum/go-ethereum/rpc" + "github.com/cartesi/rollups-node/internal/appstatus" . "github.com/cartesi/rollups-node/internal/model" "github.com/cartesi/rollups-node/internal/repository" "github.com/cartesi/rollups-node/pkg/ethutil" @@ -104,24 +105,8 @@ func getAllRunningApplications(ctx context.Context, er EvmReaderRepository) ([]* return er.ListApplications(ctx, f, repository.Pagination{}, false) } -// setApplicationInoperable marks an application as inoperable with the given reason, -// logs any error that occurs during the update, and returns an error with the reason. func (r *Service) setApplicationInoperable(ctx context.Context, app *Application, reasonFmt string, args ...any) error { - reason := fmt.Sprintf(reasonFmt, args...) - appAddress := app.IApplicationAddress.String() - - // Log the reason first - r.Logger.Error(reason, "application", app.Name, "address", appAddress) - - // Update application state - err := r.repository.UpdateApplicationState(ctx, app.ID, ApplicationState_Inoperable, &reason) - if err != nil { - r.Logger.Error("failed to update application state to inoperable", - "application", app.Name, - "address", appAddress, "err", err) - } - // Return the error with the reason - return errors.New(reason) + return appstatus.SetInoperablef(ctx, r.Logger, r.repository, app, reasonFmt, args...) } // watchForNewBlocks watches for new blocks and reads new inputs based on the diff --git a/internal/jsonrpc/jsonrpc-discover.json b/internal/jsonrpc/jsonrpc-discover.json index 5f8cffda2..2bcc8f306 100644 --- a/internal/jsonrpc/jsonrpc-discover.json +++ b/internal/jsonrpc/jsonrpc-discover.json @@ -1,5 +1,5 @@ { - "openrpc": "1.0.0", + "openrpc": "1.4.1", "info": { "title": "Cartesi Rollups Node API", "version": "2.0.0", @@ -1030,7 +1030,9 @@ "$ref": "#/components/schemas/ApplicationState" }, "reason": { - "type": "string" + "type": "string", + "nullable": true, + "description": "Human-readable failure description. Non-null when state is FAILED or INOPERABLE; null otherwise." }, "iinputbox_block": { "$ref": "#/components/schemas/UnsignedInteger" @@ -1569,6 +1571,7 @@ "enum": [ "ENABLED", "DISABLED", + "FAILED", "INOPERABLE" ] }, diff --git a/internal/manager/manager.go b/internal/manager/manager.go index efb0cadfc..a4fd9733d 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -195,7 +195,7 @@ func (m *MachineManager) UpdateMachines(ctx context.Context) error { } } - // Remove machines for disabled applications + // Remove machines for non-enabled applications (disabled, failed, etc.) m.removeMachines(apps) return nil @@ -250,11 +250,11 @@ func (m *MachineManager) removeMachines(apps []*Application) { for id, machine := range m.machines { if _, present := activeApps[id]; !present { if m.logger != nil { - m.logger.Info("Application was disabled, shutting down machine", + m.logger.Info("Application is no longer enabled, shutting down machine", "application", machine.Application().Name) } if err := machine.Close(); err != nil && m.logger != nil { - m.logger.Warn("Failed to close machine for disabled application", + m.logger.Warn("Failed to close machine for non-enabled application", "application", machine.Application().Name, "error", err) } delete(m.machines, id) diff --git a/internal/model/models.go b/internal/model/models.go index 52408628e..bb20e3cfb 100644 --- a/internal/model/models.go +++ b/internal/model/models.go @@ -145,17 +145,29 @@ func (a *Application) IsDaveConsensus() bool { return a.ConsensusType == Consensus_PRT } +// ApplicationState represents the lifecycle state of an application. +// +// State machine transitions (enforced by DB trigger): +// +// ENABLED → DISABLED, FAILED, INOPERABLE +// DISABLED → ENABLED, INOPERABLE +// FAILED → ENABLED, DISABLED, INOPERABLE (recoverable by operator) +// INOPERABLE → (terminal, no transitions allowed) +// +// DISABLED → FAILED is blocked (app must be running to fail). type ApplicationState string const ( - ApplicationState_Enabled ApplicationState = "ENABLED" - ApplicationState_Disabled ApplicationState = "DISABLED" - ApplicationState_Inoperable ApplicationState = "INOPERABLE" + ApplicationState_Enabled ApplicationState = "ENABLED" // actively processing inputs + ApplicationState_Disabled ApplicationState = "DISABLED" // stopped by operator + ApplicationState_Failed ApplicationState = "FAILED" // recoverable failure (e.g., OOM, process crash) + ApplicationState_Inoperable ApplicationState = "INOPERABLE" // irrecoverable (data corruption, invariant violation) ) var ApplicationStateAllValues = []ApplicationState{ ApplicationState_Enabled, ApplicationState_Disabled, + ApplicationState_Failed, ApplicationState_Inoperable, } @@ -175,6 +187,8 @@ func (e *ApplicationState) Scan(value any) error { *e = ApplicationState_Enabled case "DISABLED": *e = ApplicationState_Disabled + case "FAILED": + *e = ApplicationState_Failed case "INOPERABLE": *e = ApplicationState_Inoperable default: diff --git a/internal/prt/prt.go b/internal/prt/prt.go index 0c9047701..278ea2ce8 100644 --- a/internal/prt/prt.go +++ b/internal/prt/prt.go @@ -16,6 +16,7 @@ import ( "github.com/ethereum/go-ethereum/core/types" "github.com/ethereum/go-ethereum/ethclient" + "github.com/cartesi/rollups-node/internal/appstatus" "github.com/cartesi/rollups-node/internal/merkle" . "github.com/cartesi/rollups-node/internal/model" "github.com/cartesi/rollups-node/internal/repository" @@ -98,23 +99,8 @@ func getAllSubTournaments( return r.ListTournaments(ctx, nameOrAddress, f, repository.Pagination{}, false) } -// setApplicationInoperable marks an application as inoperable with the given reason, -// logs any error that occurs during the update, and returns an error with the reason. func (s *Service) setApplicationInoperable(ctx context.Context, app *Application, reasonFmt string, args ...any) error { - reason := fmt.Sprintf(reasonFmt, args...) - appAddress := app.IApplicationAddress.String() - - // Log the reason first - s.Logger.Error(reason, "application", appAddress) - - // Update application state - err := s.repository.UpdateApplicationState(ctx, app.ID, ApplicationState_Inoperable, &reason) - if err != nil { - s.Logger.Error("failed to update application state to inoperable", "app", appAddress, "err", err) - } - - // Return the error with the reason - return errors.New(reason) + return appstatus.SetInoperablef(ctx, s.Logger, s.repository, app, reasonFmt, args...) } func (s *Service) saveTournamentEvents(ctx context.Context, app *Application, epoch *Epoch, diff --git a/internal/repository/postgres/db/rollupsdb/public/enum/applicationstate.go b/internal/repository/postgres/db/rollupsdb/public/enum/applicationstate.go index 9b7965cf4..0810ebdf3 100644 --- a/internal/repository/postgres/db/rollupsdb/public/enum/applicationstate.go +++ b/internal/repository/postgres/db/rollupsdb/public/enum/applicationstate.go @@ -12,9 +12,11 @@ import "github.com/go-jet/jet/v2/postgres" var ApplicationState = &struct { Enabled postgres.StringExpression Disabled postgres.StringExpression + Failed postgres.StringExpression Inoperable postgres.StringExpression }{ Enabled: postgres.NewEnumValue("ENABLED"), Disabled: postgres.NewEnumValue("DISABLED"), + Failed: postgres.NewEnumValue("FAILED"), Inoperable: postgres.NewEnumValue("INOPERABLE"), } diff --git a/internal/repository/postgres/schema/migrations/000001_create_initial_schema.up.sql b/internal/repository/postgres/schema/migrations/000001_create_initial_schema.up.sql index b965b7969..b9c3d5cc7 100644 --- a/internal/repository/postgres/schema/migrations/000001_create_initial_schema.up.sql +++ b/internal/repository/postgres/schema/migrations/000001_create_initial_schema.up.sql @@ -8,7 +8,7 @@ CREATE DOMAIN "uint64" AS NUMERIC(20, 0) CHECK (VALUE >= 0 AND VALUE <= 18446744 CREATE DOMAIN "hash" AS BYTEA CHECK (octet_length(VALUE) = 32); CREATE DOMAIN "data_availability" AS BYTEA CHECK (octet_length(VALUE) >= 4); -CREATE TYPE "ApplicationState" AS ENUM ('ENABLED', 'DISABLED', 'INOPERABLE'); +CREATE TYPE "ApplicationState" AS ENUM ('ENABLED', 'DISABLED', 'FAILED', 'INOPERABLE'); CREATE TYPE "InputCompletionStatus" AS ENUM ( 'NONE', @@ -91,7 +91,7 @@ CREATE TABLE "application" "processed_inputs" uint64 NOT NULL, "created_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(), "updated_at" TIMESTAMPTZ NOT NULL DEFAULT NOW(), - CONSTRAINT "reason_required_for_inoperable" CHECK (NOT ("state" = 'INOPERABLE' AND ("reason" IS NULL OR LENGTH("reason") = 0))), + CONSTRAINT "reason_required_for_failure_states" CHECK (NOT ("state" IN ('FAILED', 'INOPERABLE') AND ("reason" IS NULL OR LENGTH("reason") = 0))), CONSTRAINT "application_pkey" PRIMARY KEY ("id") ); @@ -100,6 +100,33 @@ CREATE INDEX "application_data_availability_selector_idx" ON "application"(subst CREATE TRIGGER "application_set_updated_at" BEFORE UPDATE ON "application" FOR EACH ROW EXECUTE FUNCTION update_updated_at_column(); +CREATE OR REPLACE FUNCTION validate_application_state_transition() +RETURNS TRIGGER AS $$ +BEGIN + -- INOPERABLE is terminal: no state or reason changes allowed + IF OLD.state = 'INOPERABLE'::"ApplicationState" + AND (NEW.state <> OLD.state OR NEW.reason IS DISTINCT FROM OLD.reason) + THEN + RAISE EXCEPTION 'cannot change state or reason of an INOPERABLE application'; + END IF; + + -- DISABLED cannot transition to FAILED (app must be running to fail) + IF OLD.state = 'DISABLED'::"ApplicationState" AND NEW.state = 'FAILED'::"ApplicationState" THEN + RAISE EXCEPTION 'cannot transition from DISABLED to FAILED: application is not running'; + END IF; + + -- Clear stale reason when transitioning to ENABLED or DISABLED + IF NEW.state IN ('ENABLED'::"ApplicationState", 'DISABLED'::"ApplicationState") THEN + NEW.reason := NULL; + END IF; + + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER "application_validate_state_transition" BEFORE UPDATE ON "application" +FOR EACH ROW EXECUTE FUNCTION validate_application_state_transition(); + CREATE TABLE "execution_parameters" ( "application_id" INT PRIMARY KEY, "snapshot_policy" "SnapshotPolicy" NOT NULL DEFAULT 'NONE', diff --git a/internal/repository/repotest/application_test_cases.go b/internal/repository/repotest/application_test_cases.go index 5206225fb..d695fd2b1 100644 --- a/internal/repository/repotest/application_test_cases.go +++ b/internal/repository/repotest/application_test_cases.go @@ -284,15 +284,245 @@ func (s *ApplicationSuite) TestUpdateApplicationState() { app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) s.Equal(ApplicationState_Enabled, app.State) - reason := "maintenance" - err := s.Repo.UpdateApplicationState(s.Ctx, app.ID, ApplicationState_Disabled, &reason) + err := s.Repo.UpdateApplicationState(s.Ctx, app.ID, ApplicationState_Disabled, nil) s.Require().NoError(err) got, err := s.Repo.GetApplication(s.Ctx, app.Name) s.Require().NoError(err) s.Equal(ApplicationState_Disabled, got.State) + s.Nil(got.Reason) + }) + + s.Run("TriggerClearsReasonOnEnabled", func() { + // Even if a reason is passed, the DB trigger clears it for ENABLED/DISABLED states + app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) + + // First set to FAILED with a reason + reason := "machine crash" + err := s.Repo.UpdateApplicationState(s.Ctx, app.ID, ApplicationState_Failed, &reason) + s.Require().NoError(err) + + // Re-enable with a stale reason — trigger should clear it + staleReason := "should be cleared" + err = s.Repo.UpdateApplicationState(s.Ctx, app.ID, ApplicationState_Enabled, &staleReason) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Enabled, got.State) + s.Nil(got.Reason) + }) +} + +func (s *ApplicationSuite) TestInoperableIsTerminal() { + // helper: create an app and transition it to INOPERABLE via UpdateApplicationState. + makeInoperable := func(reason string) *Application { + s.T().Helper() + app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Inoperable, &reason) + s.Require().NoError(err) + return app + } + + s.Run("CannotChangeStateFromInoperable", func() { + reason := "irrecoverable error" + app := makeInoperable(reason) + + newReason := "re-enabling" + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Enabled, &newReason) + s.Require().Error(err) + s.Contains(err.Error(), "INOPERABLE") + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Inoperable, got.State) + s.Require().NotNil(got.Reason) + s.Equal(reason, *got.Reason) + }) + + s.Run("CannotChangeReasonFromInoperable", func() { + reason := "original reason" + app := makeInoperable(reason) + + newReason := "different reason" + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Inoperable, &newReason) + s.Require().Error(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) s.Require().NotNil(got.Reason) - s.Equal("maintenance", *got.Reason) + s.Equal(reason, *got.Reason) + }) + + s.Run("CanSetToInoperableFromOtherStates", func() { + app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) + + reason := "fatal error" + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Inoperable, &reason) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Inoperable, got.State) + s.Require().NotNil(got.Reason) + s.Equal("fatal error", *got.Reason) + }) + + s.Run("InoperableToSameStateAndReasonIsNoOp", func() { + reason := "irrecoverable" + app := makeInoperable(reason) + + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Inoperable, &reason) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Inoperable, got.State) + s.Require().NotNil(got.Reason) + s.Equal(reason, *got.Reason) + }) +} + +func (s *ApplicationSuite) TestFailedStateLifecycle() { + // helper: create an app and transition it to FAILED. + makeFailed := func(reason string) *Application { + s.T().Helper() + app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Failed, &reason) + s.Require().NoError(err) + return app + } + + s.Run("CanReEnableFromFailed", func() { + reason := "machine crashed" + app := makeFailed(reason) + + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Enabled, nil) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Enabled, got.State) + s.Nil(got.Reason) + }) + + s.Run("CanDisableFromFailed", func() { + reason := "process crash" + app := makeFailed(reason) + + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Disabled, nil) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Disabled, got.State) + }) + + s.Run("CanEscalateFromFailedToInoperable", func() { + app := makeFailed("machine error") + + reason := "data corruption detected" + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Inoperable, &reason) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Inoperable, got.State) + s.Require().NotNil(got.Reason) + s.Equal("data corruption detected", *got.Reason) + }) + + s.Run("ReasonClearedOnReEnable", func() { + app := makeFailed("OOM kill") + + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Enabled, nil) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Enabled, got.State) + s.Nil(got.Reason) + }) + + s.Run("FailedToFailedUpdatesReason", func() { + app := makeFailed("first crash") + + newReason := "second crash: different error" + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Failed, &newReason) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Failed, got.State) + s.Require().NotNil(got.Reason) + s.Equal("second crash: different error", *got.Reason) + }) + + s.Run("FullRecoveryCycle", func() { + // ENABLED -> FAILED -> ENABLED -> FAILED (verify full cycle works) + app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) + + // First failure + reason1 := "crash 1" + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Failed, &reason1) + s.Require().NoError(err) + + // Re-enable + err = s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Enabled, nil) + s.Require().NoError(err) + + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Enabled, got.State) + s.Nil(got.Reason) + + // Second failure + reason2 := "crash 2" + err = s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Failed, &reason2) + s.Require().NoError(err) + + got, err = s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Failed, got.State) + s.Require().NotNil(got.Reason) + s.Equal("crash 2", *got.Reason) + }) +} + +func (s *ApplicationSuite) TestDisabledToFailedBlocked() { + s.Run("CannotTransitionFromDisabledToFailed", func() { + app := NewApplicationBuilder().Create(s.Ctx, s.T(), s.Repo) + + // First disable the app + err := s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Disabled, nil) + s.Require().NoError(err) + + // Attempt DISABLED -> FAILED should be blocked by trigger + reason := "should not work" + err = s.Repo.UpdateApplicationState( + s.Ctx, app.ID, ApplicationState_Failed, &reason) + s.Require().Error(err) + s.Contains(err.Error(), "DISABLED") + + // Verify state unchanged + got, err := s.Repo.GetApplication(s.Ctx, app.Name) + s.Require().NoError(err) + s.Equal(ApplicationState_Disabled, got.State) }) } diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 2254b0f68..75e4700de 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -7,12 +7,12 @@ package validator import ( "context" - "errors" "fmt" "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/crypto" + "github.com/cartesi/rollups-node/internal/appstatus" "github.com/cartesi/rollups-node/internal/config" "github.com/cartesi/rollups-node/internal/merkle" . "github.com/cartesi/rollups-node/internal/model" @@ -118,23 +118,8 @@ func getProcessedEpochs(ctx context.Context, er ValidatorRepository, address str return er.ListEpochs(ctx, address, f, repository.Pagination{}, false) } -// setApplicationInoperable marks an application as inoperable with the given reason, -// logs any error that occurs during the update, and returns an error with the reason. func (s *Service) setApplicationInoperable(ctx context.Context, app *Application, reasonFmt string, args ...any) error { - reason := fmt.Sprintf(reasonFmt, args...) - appAddress := app.IApplicationAddress.String() - - // Log the reason first - s.Logger.Error(reason, "application", appAddress) - - // Update application state - err := s.repository.UpdateApplicationState(ctx, app.ID, ApplicationState_Inoperable, &reason) - if err != nil { - s.Logger.Error("failed to update application state to inoperable", "app", appAddress, "err", err) - } - - // Return the error with the reason - return errors.New(reason) + return appstatus.SetInoperablef(ctx, s.Logger, s.repository, app, reasonFmt, args...) } // validateApplication calculates, validates and stores the claim and/or proofs From 504062000a690345d80b0039c93fa2cafab3a154 Mon Sep 17 00:00:00 2001 From: Victor Fusco <1221933+vfusco@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:33:00 -0300 Subject: [PATCH 2/2] fix(jsonrpc): use JSON Schema Draft 07 nullable syntax --- internal/jsonrpc/jsonrpc-discover.json | 63 +++++++++----------------- 1 file changed, 22 insertions(+), 41 deletions(-) diff --git a/internal/jsonrpc/jsonrpc-discover.json b/internal/jsonrpc/jsonrpc-discover.json index 2bcc8f306..3697c495e 100644 --- a/internal/jsonrpc/jsonrpc-discover.json +++ b/internal/jsonrpc/jsonrpc-discover.json @@ -1030,8 +1030,7 @@ "$ref": "#/components/schemas/ApplicationState" }, "reason": { - "type": "string", - "nullable": true, + "type": ["string", "null"], "description": "Human-readable failure description. Non-null when state is FAILED or INOPERABLE; null otherwise." }, "iinputbox_block": { @@ -1118,38 +1117,31 @@ "$ref": "#/components/schemas/UnsignedInteger" }, "machine_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "outputs_merkle_root": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "outputs_merkle_proof": { - "type": "array", + "type": ["array", "null"], "items": { "$ref": "#/components/schemas/Hash" - }, - "nullable": true + } }, "commitment": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "commitment_proof": { - "type": "array", + "type": ["array", "null"], "items": { "$ref": "#/components/schemas/Hash" - }, - "nullable": true + } }, "claim_transaction_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "tournament_address": { - "$ref": "#/components/schemas/EthereumAddress", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/EthereumAddress"}, {"type": "null"}] }, "status": { "$ref": "#/components/schemas/EpochStatus" @@ -1219,19 +1211,16 @@ "$ref": "#/components/schemas/ByteArray" }, "decoded_data": { - "$ref": "#/components/schemas/EvmAdvance", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/EvmAdvance"}, {"type": "null"}] }, "status": { "$ref": "#/components/schemas/InputCompletionStatus" }, "machine_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "outputs_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "transaction_reference": { "$ref": "#/components/schemas/ByteArray" @@ -1329,23 +1318,19 @@ "$ref": "#/components/schemas/ByteArray" }, "decoded_data": { - "$ref": "#/components/schemas/DecodedOutput", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/DecodedOutput"}, {"type": "null"}] }, "hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "output_hashes_siblings": { - "type": "array", + "type": ["array", "null"], "items": { "$ref": "#/components/schemas/Hash" - }, - "nullable": true + } }, "execution_transaction_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "created_at": { "type": "string", @@ -1649,12 +1634,10 @@ "$ref": "#/components/schemas/EthereumAddress" }, "parent_tournament_address": { - "$ref": "#/components/schemas/EthereumAddress", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/EthereumAddress"}, {"type": "null"}] }, "parent_match_id_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "max_level": { "$ref": "#/components/schemas/UnsignedInteger" @@ -1669,12 +1652,10 @@ "$ref": "#/components/schemas/UnsignedInteger" }, "winner_commitment": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "final_state_hash": { - "$ref": "#/components/schemas/Hash", - "nullable": true + "oneOf": [{"$ref": "#/components/schemas/Hash"}, {"type": "null"}] }, "finished_at_block": { "$ref": "#/components/schemas/UnsignedInteger"