Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion dev-suite/src/memory/chroma_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,10 +246,15 @@ def query(
tiers: list[MemoryTier] | None = None,
n_results: int = 10,
task_id: str | None = None,
min_score: float | None = None,
) -> list[MemoryQueryResult]:
"""Query memory with optional tier, module, and task_id filters.

Returns list of MemoryQueryResult. Automatically excludes expired entries.

Args:
min_score: Minimum similarity score (0.0-1.0) to include. Entries
below this threshold are filtered out. None means no filtering.
"""
where_clauses: list[dict] = []
if tiers:
Expand Down Expand Up @@ -286,6 +291,10 @@ def query(
if expires > 0 and expires < now:
continue

score = 1 - dist
if min_score is not None and score < min_score:
continue

entries.append(
MemoryQueryResult(
content=doc,
Expand All @@ -299,7 +308,7 @@ def query(
verified=meta.get("verified", True),
sandbox_origin=meta.get("sandbox_origin", "none"),
mutable=meta.get("mutable", True),
score=1 - dist,
score=score,
)
)

Expand Down
7 changes: 6 additions & 1 deletion dev-suite/src/memory/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,13 @@ def query(
tiers: list[MemoryTier] | None = None,
n_results: int = 10,
task_id: str | None = None,
min_score: float | None = None,
) -> list[MemoryQueryResult]:
"""Query memory with optional filters. Excludes expired entries."""
"""Query memory with optional filters. Excludes expired entries.

Args:
min_score: Minimum similarity score (0.0-1.0). None = no filtering.
"""
...

def get_pending_approvals(self) -> list[dict]:
Expand Down
75 changes: 49 additions & 26 deletions dev-suite/src/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def _get_memory_store() -> MemoryStore:
def _fetch_memory_context(task_description: str) -> list[str]:
try:
store = _get_memory_store()
results = store.query(task_description, n_results=10)
results = store.query(task_description, n_results=10, min_score=0.3)
return [r.content for r in results]
except Exception:
return []
Expand Down Expand Up @@ -444,7 +444,17 @@ def developer_node(state: GraphState, config: RunnableConfig | None = None) -> d
content = _extract_text_content(response.content)
trace.append(f"developer: code generated ({len(content)} chars)")
logger.info("[DEV] done. tokens_used now=%d", tokens_used)
memory_writes.append({"content": f"Implemented blueprint {blueprint.task_id}: {blueprint.instructions[:200]}", "tier": "l1", "module": _infer_module(blueprint.target_files), "source_agent": "developer", "confidence": 1.0, "sandbox_origin": "locked-down", "related_files": ",".join(blueprint.target_files), "task_id": blueprint.task_id})
# Deduplicate: on retries, replace existing developer entry for this task_id
# rather than appending duplicates (trace review showed 3x identical entries).
new_entry = {"content": f"Implemented blueprint {blueprint.task_id}: {blueprint.instructions[:200]}", "tier": "l1", "module": _infer_module(blueprint.target_files), "source_agent": "developer", "confidence": 1.0, "sandbox_origin": "locked-down", "related_files": ",".join(blueprint.target_files), "task_id": blueprint.task_id}
replaced = False
for i, existing in enumerate(memory_writes):
if existing.get("task_id") == blueprint.task_id and existing.get("source_agent") == "developer":
memory_writes[i] = new_entry
replaced = True
break
if not replaced:
memory_writes.append(new_entry)
return {"generated_code": content, "status": WorkflowStatus.REVIEWING, "tokens_used": tokens_used, "trace": trace, "memory_writes": memory_writes, "tool_calls_log": tool_calls_log}


Expand All @@ -459,6 +469,42 @@ def apply_code_node(state: GraphState) -> dict:
if not blueprint:
trace.append("apply_code: no blueprint -- skipping")
return {"parsed_files": [], "trace": trace}

# Issue #105: Use per-task workspace from GraphState, fall back to env var
ws_from_state = state.get("workspace_root", "")
workspace_root = Path(ws_from_state).resolve() if ws_from_state else _get_workspace_root()

# When the developer used filesystem_write tools, the clean code is already
# on disk. Reading from disk avoids re-parsing the LLM's summary text which
# often contains markdown fences and prose that corrupt the code.
# Trace review (task-ba22eafa) showed 2 wasted retries from this mismatch.
tool_calls_log = state.get("tool_calls_log", [])
wrote_via_tools = any(
tc.get("tool") == "filesystem_write" and tc.get("success")
for tc in tool_calls_log
)

if wrote_via_tools and blueprint.target_files:
disk_files = []
for target_path in blueprint.target_files:
full_path = workspace_root / target_path
if full_path.is_file():
try:
content = full_path.read_text(encoding="utf-8")
disk_files.append({"path": target_path, "content": content})
except Exception as e:
logger.warning("[APPLY_CODE] Failed to read tool-written file %s: %s", target_path, e)
trace.append(f"apply_code: failed to read {target_path} from disk -- {e}")

if disk_files:
total_chars = sum(len(f["content"]) for f in disk_files)
trace.append(f"apply_code: read {len(disk_files)} file(s) from disk (tool-written, {total_chars:,} chars total)")
logger.info("[APPLY_CODE] Read %d tool-written files (%d chars) from %s", len(disk_files), total_chars, workspace_root)
return {"parsed_files": disk_files, "trace": trace}

trace.append("apply_code: tool-written files not found on disk, falling back to parser")

# Fallback: parse generated_code (single-shot mode or disk read failed)
try:
parsed = parse_generated_code(generated_code)
except CodeParserError as e:
Expand All @@ -468,9 +514,6 @@ def apply_code_node(state: GraphState) -> dict:
if not parsed:
trace.append("apply_code: parser returned no files")
return {"parsed_files": [], "trace": trace}
# Issue #105: Use per-task workspace from GraphState, fall back to env var
ws_from_state = state.get("workspace_root", "")
workspace_root = Path(ws_from_state).resolve() if ws_from_state else _get_workspace_root()
safe_files = validate_paths_for_workspace(parsed, workspace_root)
if len(safe_files) < len(parsed):
skipped = len(parsed) - len(safe_files)
Expand Down Expand Up @@ -671,18 +714,6 @@ def flush_memory_node(state: GraphState) -> dict:


async def _publish_code_async(state: dict) -> dict:
"""Push parsed files to a branch and open a PR.

This is an infrastructure node -- no LLM calls. It uses the
GitHubPRProvider directly via httpx for branch creation,
file push, and PR opening.

Guard chain (any condition skips gracefully):
1. No GITHUB_TOKEN configured
2. publish_pr is False (user opted out)
3. No parsed_files in state
4. No blueprint in state
"""
from .api.github_prs import github_pr_provider

trace = list(state.get("trace", []))
Expand Down Expand Up @@ -724,13 +755,10 @@ async def _publish_code_async(state: dict) -> dict:
return {"trace": trace}

trace.append(f"publish_code: pushing {len(parsed_files)} file(s)")
# Compute repo-relative paths: if workspace_root is a subdirectory
# of the repo, prefix file paths so the PR targets correct locations.
workspace_root = state.get("workspace_root", "")
repo_subdir = ""
if workspace_root:
ws = Path(workspace_root).resolve()
# Walk up to find .git directory — that's the repo root
for parent in [ws, *ws.parents]:
if (parent / ".git").exists():
try:
Expand Down Expand Up @@ -805,12 +833,7 @@ async def _publish_code_async(state: dict) -> dict:


def publish_code_node(state: GraphState) -> dict:
"""Push files to GitHub branch and open PR after QA passes.

Issue #89: This is an infrastructure node (no LLM calls).
Runs between QA-pass and flush_memory in the graph.
Uses _run_async() to bridge sync node execution to async GitHub API.
"""
"""Push files to GitHub branch and open PR after QA passes."""
return _run_async(_publish_code_async(state))


Expand Down
112 changes: 112 additions & 0 deletions dev-suite/tests/test_apply_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,115 @@ def test_status_not_changed(self, tmp_path):

# status should not be in the result (unchanged)
assert "status" not in result


class TestApplyCodeDiskRead:
"""Tests for apply_code_node's disk-read path when developer used tools."""

def _make_state(self, **overrides):
from src.agents.architect import Blueprint

base = {
"task_description": "test task",
"blueprint": Blueprint(
task_id="test-disk",
target_files=["app.py"],
instructions="build the app",
constraints=[],
acceptance_criteria=["runs"],
),
"generated_code": "## Summary\n\nDone.\n\n# --- FILE: app.py ---\n```python\nprint('hello')\n```\n",
"failure_report": None,
"status": WorkflowStatus.BUILDING,
"retry_count": 0,
"tokens_used": 0,
"error_message": "",
"memory_context": [],
"memory_writes": [],
"trace": [],
"sandbox_result": None,
"parsed_files": [],
"tool_calls_log": [],
}
base.update(overrides)
return base

def test_prefers_disk_over_parser_when_tools_used(self, tmp_path):
"""When filesystem_write was used, read files from disk instead of re-parsing."""
from src.orchestrator import apply_code_node

# Write clean code to disk (simulating what filesystem_write did)
(tmp_path / "app.py").write_text("print('hello from disk')\n")

# generated_code has fences that would corrupt parsing
state = self._make_state(
workspace_root=str(tmp_path),
tool_calls_log=[
{"agent": "developer", "turn": 1, "tool": "filesystem_write",
"args_preview": "{}", "result_preview": "ok", "success": True},
],
)

result = apply_code_node(state)

assert len(result["parsed_files"]) == 1
assert result["parsed_files"][0]["path"] == "app.py"
assert result["parsed_files"][0]["content"] == "print('hello from disk')\n"
assert any("tool-written" in t for t in result["trace"])

Comment on lines +215 to +237
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Include the apply_code_node disk-read branch in this PR (or gate these tests until it lands).

These tests currently require behavior that is not present in src/orchestrator.py (see parser-only flow around Line 461-Line 505), which is why CI is failing here. Please ship the production disk-read path (successful developer filesystem_write → read from workspace → fallback to parser) in the same PR so this suite can pass.

Also applies to: 255-274

🧰 Tools
🪛 GitHub Actions: CI

[error] 235-235: Test failed: TestApplyCodeDiskRead.test_prefers_disk_over_parser_when_tools_used. Expected parsed_files[0].content to be "print('hello from disk')\n" but got "print('hello')".

def test_falls_back_to_parser_without_tools(self, tmp_path):
"""Without filesystem_write calls, uses the parser as before."""
from src.orchestrator import apply_code_node

state = self._make_state(
workspace_root=str(tmp_path),
generated_code="# --- FILE: app.py ---\nprint('from parser')\n",
tool_calls_log=[],
)

with patch("src.orchestrator._get_workspace_root", return_value=tmp_path):
result = apply_code_node(state)

assert len(result["parsed_files"]) == 1
assert "from parser" in result["parsed_files"][0]["content"]
assert not any("tool-written" in t for t in result["trace"])

def test_falls_back_when_disk_files_missing(self, tmp_path):
"""If tool-written files aren't on disk, falls back to parser."""
from src.orchestrator import apply_code_node

# Don't write anything to disk
state = self._make_state(
workspace_root=str(tmp_path),
generated_code="# --- FILE: app.py ---\nprint('fallback')\n",
tool_calls_log=[
{"agent": "developer", "turn": 1, "tool": "filesystem_write",
"args_preview": "{}", "result_preview": "ok", "success": True},
],
)

result = apply_code_node(state)

assert len(result["parsed_files"]) == 1
assert "fallback" in result["parsed_files"][0]["content"]
assert any("falling back" in t for t in result["trace"])

def test_ignores_failed_filesystem_write(self, tmp_path):
"""Failed filesystem_write calls don't trigger disk-read path."""
from src.orchestrator import apply_code_node

state = self._make_state(
workspace_root=str(tmp_path),
generated_code="# --- FILE: app.py ---\nprint('parsed')\n",
tool_calls_log=[
{"agent": "developer", "turn": 1, "tool": "filesystem_write",
"args_preview": "{}", "result_preview": "Error", "success": False},
],
)

with patch("src.orchestrator._get_workspace_root", return_value=tmp_path):
result = apply_code_node(state)

# Should use parser path since the write failed
assert len(result["parsed_files"]) == 1
assert "parsed" in result["parsed_files"][0]["content"]
30 changes: 30 additions & 0 deletions dev-suite/tests/test_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,36 @@ def test_query_filter_by_task_id(self, store):
assert len(results_a) == 1
assert results_a[0].task_id == "task-a"

def test_query_min_score_filters_irrelevant(self, store):
"""min_score param filters out low-relevance entries."""
store.add_l1("Python triforce ASCII art pattern printer")
store.add_l1("JavaScript greet function with type hints")
# Query for triforce — greet entry should be dissimilar
all_results = store.query("triforce ASCII art", min_score=None)
assert len(all_results) == 2 # both returned without filter
# With a score threshold, only the relevant entry should survive
filtered = store.query("triforce ASCII art", min_score=0.3)
# The triforce entry should score higher than the greet entry
assert len(filtered) >= 1
assert any("triforce" in r.content.lower() for r in filtered)

def test_query_min_score_none_returns_all(self, store):
"""min_score=None (default) returns all results like before."""
store.add_l1("Alpha entry about dogs")
store.add_l1("Beta entry about cats")
store.add_l1("Gamma entry about quantum physics")
results_none = store.query("dogs", min_score=None)
results_default = store.query("dogs")
assert len(results_none) == len(results_default)

def test_query_min_score_high_threshold(self, store):
"""Very high min_score filters out most results."""
store.add_l1("Specific technical entry about Rust ownership")
store.add_l1("Completely unrelated cooking recipe for pasta")
results = store.query("Rust ownership borrow checker", min_score=0.99)
# At 0.99 threshold, almost nothing should match
assert len(results) <= 1


class TestBackwardCompatibility:
def test_add_and_query_l0_core(self, store):
Expand Down
Loading
Loading