Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llamacpp/native/vendor/llama.cpp
Submodule llama.cpp updated 1425 files
31 changes: 28 additions & 3 deletions pkg/inference/backends/llamacpp/errors.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
package llamacpp

import "regexp"
import (
	"fmt"
	"regexp"
	"strings"
	"unicode/utf8"
)

// maxVerboseOutputLength is the maximum length, in bytes, of verbose output
// included in user-facing errors. This prevents overwhelming users with
// excessive logs while keeping relevant context.
const maxVerboseOutputLength = 4096

// llamaCppErrorPatterns contains regex patterns to extract meaningful error messages
// from llama.cpp stderr output. The patterns are tried in order, and the first match wins.
Expand All @@ -19,13 +27,30 @@ var llamaCppErrorPatterns = []struct {
{regexp.MustCompile(`exiting due to model loading error`), "failed to load model"},
}

// sanitizeVerboseOutput prepares raw llama.cpp output for inclusion in a
// user-facing error message. It trims surrounding whitespace and truncates
// output longer than maxVerboseOutputLength bytes so users are not
// overwhelmed by verbose logs.
//
// NOTE(review): the output is NOT redacted — absolute file paths and other
// log details pass through verbatim (the tests expect this behavior).
func sanitizeVerboseOutput(output string) string {
	trimmed := strings.TrimSpace(output)

	// Truncate over-long output, backing the cut point off to a UTF-8 rune
	// boundary so the message never ends in an invalid, partially-cut
	// multi-byte character.
	if len(trimmed) > maxVerboseOutputLength {
		cut := maxVerboseOutputLength
		for cut > 0 && !utf8.RuneStart(trimmed[cut]) {
			cut--
		}
		trimmed = trimmed[:cut] + "\n...[truncated]"
	}

	return trimmed
}

// ExtractLlamaCppError attempts to extract a meaningful error message from llama.cpp output.
// It looks for common error patterns and returns a cleaner, more user-friendly message
// alongside the original verbose output for easier debugging.
// The verbose output is trimmed of surrounding whitespace and truncated
// if it exceeds a reasonable length.
// If no recognizable pattern is found, it returns the full output.
func ExtractLlamaCppError(output string) string {
for _, entry := range llamaCppErrorPatterns {
if entry.pattern.MatchString(output) {
return entry.message
return fmt.Sprintf("%s\n\nVerbose output:\n%s", entry.message, sanitizeVerboseOutput(output))
}
}
return output
Expand Down
49 changes: 38 additions & 11 deletions pkg/inference/backends/llamacpp/errors_test.go
Original file line number Diff line number Diff line change
@@ -1,37 +1,64 @@
package llamacpp

import (
"strings"
"testing"
)

func TestExtractLlamaCppError(t *testing.T) {
tests := []struct {
name string
input string
expected string
name string
input string
expected string
expectedPrefix string
expectTruncated bool
}{
{
name: "Metal buffer allocation failure",
input: "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
expected: "not enough GPU memory to load the model (Metal)",
name: "Metal buffer allocation failure",
input: "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
expected: "not enough GPU memory to load the model (Metal)\n\nVerbose output:\n" +
"ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
},
{
name: "cudaMalloc OOM",
input: "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
expected: "not enough GPU memory to load the model (CUDA)",
name: "cudaMalloc OOM",
input: "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
expected: "not enough GPU memory to load the model (CUDA)\n\nVerbose output:\n" +
"ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
},
{
name: "loading error",
input: `common_init_from_params: failed to load model '/models/model.gguf'
main: exiting due to model loading error`,
expected: "failed to load model",
expected: "failed to load model\n\nVerbose output:\n" +
"common_init_from_params: failed to load model '/models/model.gguf'\n" +
"main: exiting due to model loading error",
},
{
name: "input with leading/trailing whitespace",
input: "\n\n ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB \n\n",
expected: "not enough GPU memory to load the model (Metal)\n\nVerbose output:\n" +
"ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
},
{
name: "truncation of large output",
input: "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB\n" + strings.Repeat("verbose log line\n", 500),
expectedPrefix: "not enough GPU memory to load the model (Metal)\n\nVerbose output:\n" +
"ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB\n",
expectTruncated: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := ExtractLlamaCppError(tt.input)
if result != tt.expected {
if tt.expectTruncated {
if !strings.HasPrefix(result, tt.expectedPrefix) {
t.Errorf("ExtractLlamaCppError() = %q, want prefix %q", result, tt.expectedPrefix)
}
if !strings.HasSuffix(result, "...[truncated]") {
t.Errorf("ExtractLlamaCppError() = %q, want suffix ...[truncated]", result)
}
} else if result != tt.expected {
t.Errorf("ExtractLlamaCppError() = %q, want %q", result, tt.expected)
}
})
Expand Down
Loading