Merged — 27 commits
3738572
Rewrite synthetic-data-generation for improved performance and features
dustinvannoy-db Feb 16, 2026
e73c86d
Fix databricks-connect version requirements for Python compatibility
dustinvannoy-db Feb 16, 2026
58f92d8
Merge branch 'main' into feature/improve_data_gen
dustinvannoy-db Feb 16, 2026
805dbb6
Improve synthetic-data-generation skill with Spark preference and cat…
dustinvannoy-db Feb 19, 2026
fccf575
Cleanup data gen skill
dustinvannoy-db Feb 20, 2026
eb82b21
Add stronger guidance to use Databricks Connect
dustinvannoy-db Feb 20, 2026
c9ec683
Update data gen for different run modes
dustinvannoy-db Feb 24, 2026
728e454
Small updates to databricks-connect and environments
dustinvannoy-db Feb 24, 2026
3f2c9e0
Updates to improve serverless dbconnect and polars local for data gen
dustinvannoy-db Feb 24, 2026
c15572f
Add guidance on cache with serverless
dustinvannoy-db Feb 24, 2026
bdb3ab6
Update data gen for better cluster/job guidance
dustinvannoy-db Feb 25, 2026
0b9c9b3
Update classic library install
dustinvannoy-db Feb 25, 2026
d177f62
Suggest uv and improve python task job payload
dustinvannoy-db Feb 25, 2026
0e61f04
Merge branch 'main' into feature/improve_data_gen
dustinvannoy-db Feb 25, 2026
84ae64f
Add new data gen tests (first 3)
dustinvannoy-db Feb 27, 2026
ded1cf2
Update data gen ground_truth and baseline
dustinvannoy-db Feb 27, 2026
c680269
Remove default catalog setting
dustinvannoy-db Feb 27, 2026
09a9cd8
Add window syntax common issue
dustinvannoy-db Feb 27, 2026
c7e335a
Rename and overhaul data gen skill and tests timeouts
dustinvannoy-db Mar 3, 2026
d7b3c07
Merge branch 'main' into feature/improve_data_gen
dustinvannoy-db Mar 3, 2026
d4a7e3a
Fix skill name mismatch and add missing skills to install scripts
dustinvannoy-db Mar 3, 2026
e310b67
Fix PR review issues for databricks-synthetic-data-gen skill
dustinvannoy-db Mar 3, 2026
d1a8660
Simplify serverless job config in test response
dustinvannoy-db Mar 3, 2026
9c74e61
Add Python 3.12+ requirement to run instructions
dustinvannoy-db Mar 3, 2026
aa4d8c9
Remove commented out lines from manifest.yaml
dustinvannoy-db Mar 3, 2026
179856f
Update databricks-connect version range and fix version detection
dustinvannoy-db Mar 3, 2026
8265a9b
Reduce guidelines for faster tests with mlflow
dustinvannoy-db Mar 3, 2026
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,7 +1,7 @@
# Databricks AI Dev Kit
.ai-dev-kit/
.claude/

.local

# Python
__pycache__/
14 changes: 14 additions & 0 deletions .test/README.md
@@ -233,3 +233,17 @@ uv pip install -e ".test/"
uv run pytest .test/tests/
uv run python .test/scripts/regression.py <skill-name>
```

---

## Troubleshooting

### MLflow evaluation not returning results

If `/skill-test <skill-name> mlflow` hangs or doesn't return results, run manually with debug logging:

```bash
MLFLOW_LOG_LEVEL=DEBUG uv run python .test/scripts/mlflow_eval.py <skill-name>
```

This will show detailed MLflow API calls and help identify connection or authentication issues.
21 changes: 21 additions & 0 deletions .test/baselines/databricks-synthetic-data-gen/baseline.yaml
@@ -0,0 +1,21 @@
run_id: '20260303_071721'
created_at: '2026-03-03T07:17:21.838623'
skill_name: databricks-synthetic-data-gen
metrics:
  pass_rate: 1.0
  total_tests: 4
  passed_tests: 4
  failed_tests: 0
test_results:
- id: grp_20260302_113344
  passed: true
  execution_mode: local
- id: gen_serverless_job_catalog_json_002
  passed: true
  execution_mode: local
- id: grp_20260302_retail_csv_3tables_003
  passed: true
  execution_mode: local
- id: grp_20260303_manufacturing_delta_streaming_004
  passed: true
  execution_mode: local
54 changes: 53 additions & 1 deletion .test/scripts/mlflow_eval.py
@@ -2,29 +2,65 @@
"""Run MLflow evaluation for a skill.

Usage:
-    python mlflow_eval.py <skill_name> [--filter-category <category>] [--run-name <name>]
+    python mlflow_eval.py <skill_name> [--filter-category <category>] [--run-name <name>] [--timeout <seconds>]

Environment Variables:
    DATABRICKS_CONFIG_PROFILE - Databricks CLI profile (default: "DEFAULT")
    MLFLOW_TRACKING_URI - Set to "databricks" for Databricks MLflow
    MLFLOW_EXPERIMENT_NAME - Experiment path (e.g., "/Users/{user}/skill-test")
    MLFLOW_LLM_JUDGE_TIMEOUT - Timeout in seconds for LLM judge evaluation (default: 120)
"""
import os
import sys
import signal
import argparse

# Close stdin and disable tqdm progress bars when run non-interactively
# This fixes hanging issues with tqdm/MLflow progress bars in background tasks
if not sys.stdin.isatty():
    try:
        sys.stdin.close()
        sys.stdin = open(os.devnull, 'r')
    except Exception:
        pass
    # Disable tqdm progress bars
    os.environ.setdefault("TQDM_DISABLE", "1")

# Import common utilities
from _common import setup_path, print_result, handle_error


class TimeoutException(Exception):
    pass


def timeout_handler(signum, frame):
    raise TimeoutException("MLflow evaluation timed out")


def main():
    parser = argparse.ArgumentParser(description="Run MLflow evaluation for a skill")
    parser.add_argument("skill_name", help="Name of skill to evaluate")
    parser.add_argument("--filter-category", help="Filter by test category")
    parser.add_argument("--run-name", help="Custom MLflow run name")
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Timeout in seconds for evaluation (default: 120)",
    )
    args = parser.parse_args()

    setup_path()

    # Set up signal-based timeout (Unix only)
    if hasattr(signal, 'SIGALRM'):
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(args.timeout)
    else:
        # Windows: SIGALRM not available - no timeout enforcement
        print("WARNING: Timeout not supported on Windows - test may run indefinitely", file=sys.stderr)

    try:
        from skill_test.runners import evaluate_skill

@@ -34,6 +70,10 @@ def main():
            run_name=args.run_name,
        )

        # Cancel the alarm if we succeeded
        if hasattr(signal, 'SIGALRM'):
            signal.alarm(0)

        # Convert to standard result format
        if result.get("run_id"):
            result["success"] = True
@@ -42,7 +82,19 @@

        sys.exit(print_result(result))

    except TimeoutException as e:
        result = {
            "success": False,
            "skill_name": args.skill_name,
            "error": f"Evaluation timed out after {args.timeout} seconds. This may indicate LLM judge endpoint issues.",
            "error_type": "timeout",
        }
        sys.exit(print_result(result))

    except Exception as e:
        # Cancel alarm on any exception
        if hasattr(signal, 'SIGALRM'):
            signal.alarm(0)
        sys.exit(handle_error(e, args.skill_name))


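The SIGALRM timeout that this diff adds to `mlflow_eval.py` can be sketched in isolation. The sketch below is a minimal illustration of the same pattern, not code from the PR; the `run_with_timeout` helper, its arguments, and the sample calls are hypothetical:

```python
import signal


class TimeoutException(Exception):
    """Raised when the wrapped call exceeds its time budget."""


def run_with_timeout(func, seconds):
    """Run func() under a SIGALRM-based timeout (Unix only)."""
    if not hasattr(signal, "SIGALRM"):
        # Windows has no SIGALRM, so run without enforcement,
        # matching the warning-only behavior in mlflow_eval.py
        return func()

    def handler(signum, frame):
        raise TimeoutException(f"timed out after {seconds}s")

    previous = signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)  # schedule SIGALRM after `seconds`
    try:
        return func()
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, previous)  # restore old handler


print(run_with_timeout(lambda: "ok", 5))  # prints "ok"
```

Putting `signal.alarm(0)` in `finally` mirrors the PR's cleanup on both the success and exception paths; the `hasattr` guard is needed because SIGALRM exists only on Unix, and signal handlers can only be installed from the main thread.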
2 changes: 1 addition & 1 deletion .test/skills/_routing/ground_truth.yaml
@@ -99,7 +99,7 @@ test_cases:
    prompt: "Generate synthetic customer data and evaluate the agent quality with MLflow scorers"
    expectations:
      expected_skills:
-        - "databricks-synthetic-data-generation"
+        - "databricks-synthetic-data-gen"
        - "databricks-mlflow-evaluation"
      is_multi_skill: true
    metadata:
7 changes: 7 additions & 0 deletions .test/skills/databricks-synthetic-data-gen/candidates.yaml
@@ -0,0 +1,7 @@
# Candidates for databricks-synthetic-data-gen skill
# Test cases pending review before promotion to ground_truth.yaml
#
# Use `/skill-test databricks-synthetic-data-gen add` to create new candidates
# Use `/skill-test databricks-synthetic-data-gen review` to promote candidates to ground truth

candidates: []