From 9c1cab4c8015c2f6abb741c81fcbf4792eb72e36 Mon Sep 17 00:00:00 2001
From: Gordon <gordon@docker.local>
Date: Tue, 10 Mar 2026 19:13:59 +0100
Subject: [PATCH 1/2] fix: improve todo tool reliability by reminding LLM of
 incomplete items

LLMs frequently create todos but fail to mark all of them as completed,
leaving the todo sidebar in a partially-done state. This happens because
the instruction to complete todos lives in the system prompt, which is
far back in the context by the time the LLM finishes its work.

Add an incomplete-todo reminder to update_todos and list_todos tool
output so the LLM sees unfinished items directly in its immediate
context. Also strengthen the system instructions to emphasize that every
todo must be completed before responding.
---
 pkg/tools/builtin/todo.go      | 54 +++++++++++++++++++++++++++++-----
 pkg/tools/builtin/todo_test.go | 28 ++++++++++++++++++
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/pkg/tools/builtin/todo.go b/pkg/tools/builtin/todo.go
index 0cf1b55fb..53c49b2f6 100644
--- a/pkg/tools/builtin/todo.go
+++ b/pkg/tools/builtin/todo.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"strings"
 	"sync"
 	"sync/atomic"
 
@@ -60,10 +61,12 @@ type CreateTodosOutput struct {
 type UpdateTodosOutput struct {
 	Updated  []TodoUpdate `json:"updated,omitempty" jsonschema:"List of successfully updated todos"`
 	NotFound []string     `json:"not_found,omitempty" jsonschema:"IDs of todos that were not found"`
+	Reminder string       `json:"reminder,omitempty" jsonschema:"Reminder about incomplete todos that still need to be completed"`
 }
 
 type ListTodosOutput struct {
-	Todos []Todo `json:"todos" jsonschema:"List of all current todo items"`
+	Todos    []Todo `json:"todos" jsonschema:"List of all current todo items"`
+	Reminder string `json:"reminder,omitempty" jsonschema:"Reminder about incomplete todos that still need to be completed"`
 }
 
 // TodoStorage defines the storage layer for todo items.
@@ -157,17 +160,20 @@ func (t *TodoTool) Instructions() string {
 IMPORTANT: You MUST use these tools to track the progress of your tasks:
 
 1. Before starting any complex task:
-	- Create a todo for each major step using create_todo
+	- Create a todo for each major step using create_todos (prefer batch creation)
 	- Break down complex steps into smaller todos
 
 2. While working:
+	- Update todo status to "in-progress" BEFORE starting each task
+	- Mark todos as "completed" IMMEDIATELY after finishing each task
 	- Use list_todos frequently to keep track of remaining work
-	- Mark todos as "completed" when finished
 
-3. Task Management Rules:
-	- Never start a new task without creating a todo for it
-	- Always check list_todos before responding to ensure no steps are missed
-	- Update todo status to reflect current progress
+3. Task Completion Rules:
+	- EVERY todo you create MUST eventually be marked "completed"
+	- Before sending your final response, call list_todos to verify ALL todos are completed
+	- If any todos remain pending or in-progress, complete them or mark them completed before responding
+	- Never leave todos in a pending or in-progress state when you are done working
+	- When updating multiple todos, batch them in a single update_todos call
 
 This toolset is REQUIRED for maintaining task state and ensuring all steps are completed.`
 }
@@ -235,6 +241,8 @@ func (h *todoHandler) updateTodos(_ context.Context, params UpdateTodosArgs) (*t
 
 	if h.allCompleted() {
 		h.storage.Clear()
+	} else {
+		result.Reminder = h.incompleteReminder()
 	}
 
 	return h.jsonResult(result)
@@ -253,12 +261,42 @@ func (h *todoHandler) allCompleted() bool {
 	return true
 }
 
+// incompleteReminder returns a reminder string listing any non-completed todos,
+// or an empty string if all are completed (or storage is empty).
+func (h *todoHandler) incompleteReminder() string {
+	all := h.storage.All()
+	var pending, inProgress []string
+	for _, todo := range all {
+		switch todo.Status {
+		case "pending":
+			pending = append(pending, fmt.Sprintf("[%s] %s", todo.ID, todo.Description))
+		case "in-progress":
+			inProgress = append(inProgress, fmt.Sprintf("[%s] %s", todo.ID, todo.Description))
+		}
+	}
+	if len(pending) == 0 && len(inProgress) == 0 {
+		return ""
+	}
+
+	var b strings.Builder
+	b.WriteString("The following todos are still incomplete and MUST be completed:")
+	for _, s := range inProgress {
+		b.WriteString(" (in-progress) " + s)
+	}
+	for _, s := range pending {
+		b.WriteString(" (pending) " + s)
+	}
+	return b.String()
+}
+
 func (h *todoHandler) listTodos(_ context.Context, _ tools.ToolCall) (*tools.ToolCallResult, error) {
 	todos := h.storage.All()
 	if todos == nil {
 		todos = []Todo{}
 	}
-	return h.jsonResult(ListTodosOutput{Todos: todos})
+	out := ListTodosOutput{Todos: todos}
+	out.Reminder = h.incompleteReminder()
+	return h.jsonResult(out)
 }
 
 func (t *TodoTool) Tools(context.Context) ([]tools.Tool, error) {
diff --git a/pkg/tools/builtin/todo_test.go b/pkg/tools/builtin/todo_test.go
index 6b72f7cf5..c4d4dd4a2 100644
--- a/pkg/tools/builtin/todo_test.go
+++ b/pkg/tools/builtin/todo_test.go
@@ -95,9 +95,28 @@ func TestTodoTool_ListTodos(t *testing.T) {
 		assert.Equal(t, "pending", output.Todos[i].Status)
 	}
 
+	// All pending, so reminder should list all of them
+	assert.Contains(t, output.Reminder, "todo_1")
+	assert.Contains(t, output.Reminder, "todo_2")
+	assert.Contains(t, output.Reminder, "todo_3")
+
 	requireMeta(t, result, 3)
 }
 
+func TestTodoTool_ListTodos_Empty(t *testing.T) {
+	tool := NewTodoTool()
+
+	result, err := tool.handler.listTodos(t.Context(), tools.ToolCall{})
+	require.NoError(t, err)
+
+	var output ListTodosOutput
+	require.NoError(t, json.Unmarshal([]byte(result.Output), &output))
+	assert.Empty(t, output.Todos)
+	assert.Empty(t, output.Reminder)
+
+	requireMeta(t, result, 0)
+}
+
 func TestTodoTool_UpdateTodos(t *testing.T) {
 	storage := NewMemoryTodoStorage()
 	tool := NewTodoTool(WithStorage(storage))
@@ -125,6 +144,11 @@ func TestTodoTool_UpdateTodos(t *testing.T) {
 	assert.Equal(t, "in-progress", output.Updated[1].Status)
 	assert.Empty(t, output.NotFound)
 
+	// Reminder should list incomplete todos
+	assert.Contains(t, output.Reminder, "todo_2")
+	assert.Contains(t, output.Reminder, "todo_3")
+	assert.NotContains(t, output.Reminder, "todo_1") // completed, should not appear
+
 	todos := storage.All()
 	require.Len(t, todos, 3)
 	assert.Equal(t, "completed", todos[0].Status)
@@ -159,6 +183,9 @@ func TestTodoTool_UpdateTodos_PartialFailure(t *testing.T) {
 	require.Len(t, output.NotFound, 1)
 	assert.Equal(t, "nonexistent", output.NotFound[0])
 
+	// Reminder should mention the still-pending todo
+	assert.Contains(t, output.Reminder, "todo_2")
+
 	todos := storage.All()
 	require.Len(t, todos, 2)
 	assert.Equal(t, "completed", todos[0].Status)
@@ -205,6 +232,7 @@ func TestTodoTool_UpdateTodos_ClearsWhenAllCompleted(t *testing.T) {
 	var output UpdateTodosOutput
 	require.NoError(t, json.Unmarshal([]byte(result.Output), &output))
 	require.Len(t, output.Updated, 2)
+	assert.Empty(t, output.Reminder) // no reminder when all completed
 
 	assert.Empty(t, storage.All())
 	requireMeta(t, result, 0)

From 6416e3b378026e0ced90cc99262cf87d9e65b3bb Mon Sep 17 00:00:00 2001
From: Gordon <gordon@docker.local>
Date: Wed, 11 Mar 2026 11:26:33 +0100
Subject: [PATCH 2/2] todo: include full state and reminder in all tool
 responses

Add an AllTodos field to CreateTodoOutput, CreateTodosOutput, and
UpdateTodosOutput so every response includes the complete current
state of all todo items. This gives the LLM full visibility into
the todo list without needing a separate list_todos call. The
create_todo and create_todos responses also gain the Reminder field
that update_todos and list_todos already return.

Also remove the auto-clear-on-all-completed behavior so that
completed items remain visible, and add CreateTodoOutput as a
dedicated output type for create_todo (replacing the bare Todo).
---
 pkg/tools/builtin/todo.go      | 46 +++++++++---------
 pkg/tools/builtin/todo_test.go | 89 +++++++++++++++++++++++++++++++---
 2 files changed, 105 insertions(+), 30 deletions(-)

diff --git a/pkg/tools/builtin/todo.go b/pkg/tools/builtin/todo.go
index 53c49b2f6..14d9a6bf9 100644
--- a/pkg/tools/builtin/todo.go
+++ b/pkg/tools/builtin/todo.go
@@ -54,13 +54,22 @@ type UpdateTodosArgs struct {
 
 // Output types for JSON-structured responses.
 
+type CreateTodoOutput struct {
+	Created  Todo   `json:"created" jsonschema:"The created todo item"`
+	AllTodos []Todo `json:"all_todos" jsonschema:"Current state of all todo items"`
+	Reminder string `json:"reminder,omitempty" jsonschema:"Reminder about incomplete todos that still need to be completed"`
+}
+
 type CreateTodosOutput struct {
-	Created []Todo `json:"created" jsonschema:"List of created todo items"`
+	Created  []Todo `json:"created" jsonschema:"List of created todo items"`
+	AllTodos []Todo `json:"all_todos" jsonschema:"Current state of all todo items"`
+	Reminder string `json:"reminder,omitempty" jsonschema:"Reminder about incomplete todos that still need to be completed"`
 }
 
 type UpdateTodosOutput struct {
 	Updated  []TodoUpdate `json:"updated,omitempty" jsonschema:"List of successfully updated todos"`
 	NotFound []string     `json:"not_found,omitempty" jsonschema:"IDs of todos that were not found"`
+	AllTodos []Todo       `json:"all_todos" jsonschema:"Current state of all todo items"`
 	Reminder string       `json:"reminder,omitempty" jsonschema:"Reminder about incomplete todos that still need to be completed"`
 }
 
@@ -202,7 +211,12 @@ func (h *todoHandler) jsonResult(v any) (*tools.ToolCallResult, error) {
 }
 
 func (h *todoHandler) createTodo(_ context.Context, params CreateTodoArgs) (*tools.ToolCallResult, error) {
-	return h.jsonResult(h.addTodo(params.Description))
+	created := h.addTodo(params.Description)
+	return h.jsonResult(CreateTodoOutput{
+		Created:  created,
+		AllTodos: h.storage.All(),
+		Reminder: h.incompleteReminder(),
+	})
 }
 
 func (h *todoHandler) createTodos(_ context.Context, params CreateTodosArgs) (*tools.ToolCallResult, error) {
@@ -210,7 +224,11 @@ func (h *todoHandler) createTodos(_ context.Context, params CreateTodosArgs) (*t
 	for _, desc := range params.Descriptions {
 		created = append(created, h.addTodo(desc))
 	}
-	return h.jsonResult(CreateTodosOutput{Created: created})
+	return h.jsonResult(CreateTodosOutput{
+		Created:  created,
+		AllTodos: h.storage.All(),
+		Reminder: h.incompleteReminder(),
+	})
 }
 
 func (h *todoHandler) updateTodos(_ context.Context, params UpdateTodosArgs) (*tools.ToolCallResult, error) {
@@ -239,28 +257,12 @@ func (h *todoHandler) updateTodos(_ context.Context, params UpdateTodosArgs) (*t
 		return res, nil
 	}
 
-	if h.allCompleted() {
-		h.storage.Clear()
-	} else {
-		result.Reminder = h.incompleteReminder()
-	}
+	result.AllTodos = h.storage.All()
+	result.Reminder = h.incompleteReminder()
 
 	return h.jsonResult(result)
 }
 
-func (h *todoHandler) allCompleted() bool {
-	all := h.storage.All()
-	if len(all) == 0 {
-		return false
-	}
-	for _, todo := range all {
-		if todo.Status != "completed" {
-			return false
-		}
-	}
-	return true
-}
-
 // incompleteReminder returns a reminder string listing any non-completed todos,
 // or an empty string if all are completed (or storage is empty).
 func (h *todoHandler) incompleteReminder() string {
@@ -306,7 +308,7 @@ func (t *TodoTool) Tools(context.Context) ([]tools.Tool, error) {
 			Category:     "todo",
 			Description:  "Create a new todo item with a description",
 			Parameters:   tools.MustSchemaFor[CreateTodoArgs](),
-			OutputSchema: tools.MustSchemaFor[Todo](),
+			OutputSchema: tools.MustSchemaFor[CreateTodoOutput](),
 			Handler:      tools.NewHandler(t.handler.createTodo),
 			Annotations: tools.ToolAnnotations{
 				Title:        "Create TODO",
diff --git a/pkg/tools/builtin/todo_test.go b/pkg/tools/builtin/todo_test.go
index c4d4dd4a2..b17da8103 100644
--- a/pkg/tools/builtin/todo_test.go
+++ b/pkg/tools/builtin/todo_test.go
@@ -31,11 +31,16 @@ func TestTodoTool_CreateTodo(t *testing.T) {
 	})
 	require.NoError(t, err)
 
-	var output Todo
+	var output CreateTodoOutput
 	require.NoError(t, json.Unmarshal([]byte(result.Output), &output))
-	assert.Equal(t, "todo_1", output.ID)
-	assert.Equal(t, "Test todo item", output.Description)
-	assert.Equal(t, "pending", output.Status)
+	assert.Equal(t, "todo_1", output.Created.ID)
+	assert.Equal(t, "Test todo item", output.Created.Description)
+	assert.Equal(t, "pending", output.Created.Status)
+
+	// Full state is included in the response
+	require.Len(t, output.AllTodos, 1)
+	assert.Equal(t, "todo_1", output.AllTodos[0].ID)
+	assert.Contains(t, output.Reminder, "todo_1")
 
 	require.Equal(t, 1, storage.Len())
 	requireMeta(t, result, 1)
@@ -59,10 +64,16 @@ func TestTodoTool_CreateTodos(t *testing.T) {
 	assert.Equal(t, "todo_2", output.Created[1].ID)
 	assert.Equal(t, "todo_3", output.Created[2].ID)
 
+	// Full state included in response
+	require.Len(t, output.AllTodos, 3)
+	assert.Contains(t, output.Reminder, "todo_1")
+	assert.Contains(t, output.Reminder, "todo_2")
+	assert.Contains(t, output.Reminder, "todo_3")
+
 	assert.Equal(t, 3, storage.Len())
 	requireMeta(t, result, 3)
 
-	// A second call continues the ID sequence
+	// A second call continues the ID sequence and includes all 4 items
 	result, err = tool.handler.createTodos(t.Context(), CreateTodosArgs{
 		Descriptions: []string{"Last"},
 	})
@@ -71,6 +82,7 @@ func TestTodoTool_CreateTodos(t *testing.T) {
 	require.NoError(t, json.Unmarshal([]byte(result.Output), &output))
 	require.Len(t, output.Created, 1)
 	assert.Equal(t, "todo_4", output.Created[0].ID)
+	require.Len(t, output.AllTodos, 4)
 	assert.Equal(t, 4, storage.Len())
 	requireMeta(t, result, 4)
 }
@@ -144,6 +156,12 @@ func TestTodoTool_UpdateTodos(t *testing.T) {
 	assert.Equal(t, "in-progress", output.Updated[1].Status)
 	assert.Empty(t, output.NotFound)
 
+	// Full state included in response
+	require.Len(t, output.AllTodos, 3)
+	assert.Equal(t, "completed", output.AllTodos[0].Status)
+	assert.Equal(t, "pending", output.AllTodos[1].Status)
+	assert.Equal(t, "in-progress", output.AllTodos[2].Status)
+
 	// Reminder should list incomplete todos
 	assert.Contains(t, output.Reminder, "todo_2")
 	assert.Contains(t, output.Reminder, "todo_3")
@@ -212,7 +230,7 @@ func TestTodoTool_UpdateTodos_AllNotFound(t *testing.T) {
 	assert.Equal(t, "nonexistent2", output.NotFound[1])
 }
 
-func TestTodoTool_UpdateTodos_ClearsWhenAllCompleted(t *testing.T) {
+func TestTodoTool_UpdateTodos_AllCompleted_NoAutoRemoval(t *testing.T) {
 	storage := NewMemoryTodoStorage()
 	tool := NewTodoTool(WithStorage(storage))
 
@@ -234,8 +252,14 @@ func TestTodoTool_UpdateTodos_ClearsWhenAllCompleted(t *testing.T) {
 	require.Len(t, output.Updated, 2)
 	assert.Empty(t, output.Reminder) // no reminder when all completed
 
-	assert.Empty(t, storage.All())
-	requireMeta(t, result, 0)
+	// Full state shows both items as completed
+	require.Len(t, output.AllTodos, 2)
+	assert.Equal(t, "completed", output.AllTodos[0].Status)
+	assert.Equal(t, "completed", output.AllTodos[1].Status)
+
+	// Todos remain in storage (no auto-clear on completion)
+	assert.Equal(t, 2, storage.Len())
+	requireMeta(t, result, 2)
 }
 
 func TestTodoTool_WithStorage(t *testing.T) {
@@ -282,6 +306,55 @@ func TestTodoTool_ParametersAreObjects(t *testing.T) {
 	}
 }
 
+func TestTodoTool_CreateTodo_FullStateOutput(t *testing.T) {
+	tool := NewTodoTool()
+
+	// Create first todo
+	result1, err := tool.handler.createTodo(t.Context(), CreateTodoArgs{Description: "First"})
+	require.NoError(t, err)
+	var out1 CreateTodoOutput
+	require.NoError(t, json.Unmarshal([]byte(result1.Output), &out1))
+	require.Len(t, out1.AllTodos, 1)
+	assert.Contains(t, out1.Reminder, "todo_1")
+
+	// Create second todo — response shows both
+	result2, err := tool.handler.createTodo(t.Context(), CreateTodoArgs{Description: "Second"})
+	require.NoError(t, err)
+	var out2 CreateTodoOutput
+	require.NoError(t, json.Unmarshal([]byte(result2.Output), &out2))
+	require.Len(t, out2.AllTodos, 2)
+	assert.Contains(t, out2.Reminder, "todo_1")
+	assert.Contains(t, out2.Reminder, "todo_2")
+}
+
+func TestTodoTool_UpdateTodos_FullStateOutput(t *testing.T) {
+	tool := NewTodoTool()
+
+	_, err := tool.handler.createTodos(t.Context(), CreateTodosArgs{
+		Descriptions: []string{"A", "B", "C"},
+	})
+	require.NoError(t, err)
+
+	result, err := tool.handler.updateTodos(t.Context(), UpdateTodosArgs{
+		Updates: []TodoUpdate{{ID: "todo_1", Status: "completed"}},
+	})
+	require.NoError(t, err)
+
+	var output UpdateTodosOutput
+	require.NoError(t, json.Unmarshal([]byte(result.Output), &output))
+
+	// AllTodos shows full state including the completed item
+	require.Len(t, output.AllTodos, 3)
+	assert.Equal(t, "completed", output.AllTodos[0].Status)
+	assert.Equal(t, "pending", output.AllTodos[1].Status)
+	assert.Equal(t, "pending", output.AllTodos[2].Status)
+
+	// Reminder only lists incomplete items
+	assert.NotContains(t, output.Reminder, "todo_1")
+	assert.Contains(t, output.Reminder, "todo_2")
+	assert.Contains(t, output.Reminder, "todo_3")
+}
+
 // requireMeta asserts that result.Meta is a []Todo of the expected length.
 func requireMeta(t *testing.T, result *tools.ToolCallResult, expectedLen int) {
 	t.Helper()