Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llamacpp/native/vendor/llama.cpp
Submodule llama.cpp updated 1425 files
31 changes: 28 additions & 3 deletions pkg/inference/backends/llamacpp/errors.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
package llamacpp

import "regexp"
import (
	"fmt"
	"regexp"
	"strings"
	"unicode/utf8"
)

// maxVerboseOutputLength is the maximum length, in bytes, of verbose output
// included in user-facing errors. This prevents overwhelming users with
// excessive logs while keeping relevant context.
const maxVerboseOutputLength = 4096

// llamaCppErrorPatterns contains regex patterns to extract meaningful error messages
// from llama.cpp stderr output. The patterns are tried in order, and the first match wins.
Expand All @@ -19,13 +27,30 @@ var llamaCppErrorPatterns = []struct {
{regexp.MustCompile(`exiting due to model loading error`), "failed to load model"},
}

// sanitizeVerboseOutput prepares raw llama.cpp output for inclusion in a
// user-facing error message. It trims surrounding whitespace and truncates
// output longer than maxVerboseOutputLength bytes so users are not
// overwhelmed by verbose logs.
//
// NOTE(review): the output is NOT redacted — absolute file paths and other
// log details pass through verbatim (the tests expect this behavior).
func sanitizeVerboseOutput(output string) string {
	trimmed := strings.TrimSpace(output)

	// Truncate over-long output, backing the cut point off to a UTF-8 rune
	// boundary so the message never ends in an invalid, partially-cut
	// multi-byte character.
	if len(trimmed) > maxVerboseOutputLength {
		cut := maxVerboseOutputLength
		for cut > 0 && !utf8.RuneStart(trimmed[cut]) {
			cut--
		}
		trimmed = trimmed[:cut] + "\n...[truncated]"
	}

	return trimmed
}

// ExtractLlamaCppError attempts to extract a meaningful error message from llama.cpp output.
// It looks for common error patterns and returns a cleaner, more user-friendly message
// alongside the original verbose output for easier debugging.
// The verbose output is trimmed of surrounding whitespace and truncated
// if it exceeds a reasonable length.
// If no recognizable pattern is found, it returns the full output.
func ExtractLlamaCppError(output string) string {
for _, entry := range llamaCppErrorPatterns {
if entry.pattern.MatchString(output) {
return entry.message
return fmt.Sprintf("%s\n\nVerbose output:\n%s", entry.message, sanitizeVerboseOutput(output))
}
}
return output
Expand Down
49 changes: 38 additions & 11 deletions pkg/inference/backends/llamacpp/errors_test.go
Original file line number Diff line number Diff line change
@@ -1,37 +1,64 @@
package llamacpp

import (
"strings"
"testing"
)

func TestExtractLlamaCppError(t *testing.T) {
tests := []struct {
name string
input string
expected string
name string
input string
expected string
expectedPrefix string
expectTruncated bool
}{
{
name: "Metal buffer allocation failure",
input: "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
expected: "not enough GPU memory to load the model (Metal)",
name: "Metal buffer allocation failure",
input: "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
expected: "not enough GPU memory to load the model (Metal)\n\nVerbose output:\n" +
"ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
},
{
name: "cudaMalloc OOM",
input: "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
expected: "not enough GPU memory to load the model (CUDA)",
name: "cudaMalloc OOM",
input: "ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
expected: "not enough GPU memory to load the model (CUDA)\n\nVerbose output:\n" +
"ggml_backend_cuda_buffer_type_alloc_buffer: allocating 12.50 MiB on device 1: cudaMalloc failed: out of memory",
},
{
name: "loading error",
input: `common_init_from_params: failed to load model '/models/model.gguf'
main: exiting due to model loading error`,
expected: "failed to load model",
expected: "failed to load model\n\nVerbose output:\n" +
"common_init_from_params: failed to load model '/models/model.gguf'\n" +
"main: exiting due to model loading error",
},
{
name: "input with leading/trailing whitespace",
input: "\n\n ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB \n\n",
expected: "not enough GPU memory to load the model (Metal)\n\nVerbose output:\n" +
"ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB",
},
{
name: "truncation of large output",
input: "ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB\n" + strings.Repeat("verbose log line\n", 500),
expectedPrefix: "not enough GPU memory to load the model (Metal)\n\nVerbose output:\n" +
"ggml_metal_buffer_init: error: failed to allocate buffer, size = 2048.00 MiB\n",
expectTruncated: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := ExtractLlamaCppError(tt.input)
if result != tt.expected {
if tt.expectTruncated {
if !strings.HasPrefix(result, tt.expectedPrefix) {
t.Errorf("ExtractLlamaCppError() = %q, want prefix %q", result, tt.expectedPrefix)
}
if !strings.HasSuffix(result, "...[truncated]") {
t.Errorf("ExtractLlamaCppError() = %q, want suffix ...[truncated]", result)
}
} else if result != tt.expected {
t.Errorf("ExtractLlamaCppError() = %q, want %q", result, tt.expected)
}
})
Expand Down
Loading