diff --git a/bench_test.go b/bench_test.go index ebf8a0d..e3b5521 100644 --- a/bench_test.go +++ b/bench_test.go @@ -25,6 +25,61 @@ func BenchmarkRedact(b *testing.B) { } } +type benchAddr struct { + Host string + Port int +} + +func (a benchAddr) SafeFormat(w SafePrinter, _ rune) { + w.Printf("%s:%d", a.Host, Safe(a.Port)) +} + +type benchRequest struct { + Method string + Path string + From benchAddr +} + +func (r benchRequest) SafeFormat(w SafePrinter, _ rune) { + w.Printf("%s %s from %v", Safe(r.Method), r.Path, r.From) +} + +func BenchmarkSprintfWithSafeFormatter(b *testing.B) { + req := benchRequest{ + Method: "GET", + Path: "/api/v1/users", + From: benchAddr{Host: "192.168.1.1", Port: 8080}, + } + + b.Run("single_struct", func(b *testing.B) { + addr := benchAddr{Host: "10.0.0.1", Port: 5432} + for i := 0; i < b.N; i++ { + _ = Sprintf("connecting to %v", addr) + } + }) + + b.Run("nested_structs", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = Sprintf("received %v", req) + } + }) + + b.Run("multiple_structs", func(b *testing.B) { + src := benchAddr{Host: "10.0.0.1", Port: 3000} + dst := benchAddr{Host: "10.0.0.2", Port: 5432} + for i := 0; i < b.N; i++ { + _ = Sprintf("proxy %v -> %v for %v", src, dst, req) + } + }) + + b.Run("sprint_mixed", func(b *testing.B) { + addr := benchAddr{Host: "10.0.0.1", Port: 5432} + for i := 0; i < b.N; i++ { + _ = Sprint("request ", req, " via ", addr, " user=", "alice") + } + }) +} + // BenchmarkRedactCall_PlainMarkers calls .Redact() on a string with only // regular ‹...› markers (no hash markers). This is the baseline. 
func BenchmarkRedactCall_RegularRedaction(b *testing.B) { @@ -57,4 +112,3 @@ func BenchmarkRedactCall_HashWithSalt(b *testing.B) { _ = s.Redact() } } - diff --git a/internal/escape/escape.go b/internal/escape/escape.go index 9249073..304945c 100644 --- a/internal/escape/escape.go +++ b/internal/escape/escape.go @@ -35,11 +35,23 @@ func InternalEscapeBytes(b []byte, startLoc int, breakNewLines, strip bool) (res // Note: we use len(...RedactableS) and not len(...RedactableBytes) // because the ...S variant is a compile-time constant so this // accelerates the loops below. - start, ls := m.StartBytes, len(m.StartS) - end, le := m.EndBytes, len(m.EndS) - hashPrefix, lh := m.HashPrefixBytes, len(m.HashPrefixS) + start := m.StartBytes + ls := len(m.StartS) + end := m.EndBytes + le := len(m.EndS) + hashPrefix := m.HashPrefixBytes + lh := len(m.HashPrefixS) escape := m.EscapeMarkBytes + // All markers share the same lead byte (0xE2) and second byte (0x80). + // This invariant is verified by the init() check in markers.go. + // We use this to skip over ASCII data quickly. + lead := start[0] + mid := start[1] + b2Start := start[2] + b2End := end[2] + b2Hash := hashPrefix[2] + // Trim final newlines/spaces, for convenience. if strip { end := len(b) @@ -64,68 +76,95 @@ func InternalEscapeBytes(b []byte, startLoc int, breakNewLines, strip bool) (res // already copied into res (if copied=true). k := 0 - for i := startLoc; i < len(b); i++ { - if breakNewLines && b[i] == '\n' { + for i := startLoc; i < len(b); { + // Use bytes.IndexByte to skip over runs of bytes that can't + // start a marker. The lead byte (0xE2) starts all marker + // sequences. When breakNewLines is false, we only need to find + // the lead byte. When true, we need to handle newlines too. + remaining := b[i:] + var idx int + if !breakNewLines { + idx = bytes.IndexByte(remaining, lead) + } else { + // Find the first byte that could be interesting: lead or newline. 
+ // Use two IndexByte calls and take the minimum. + idxLead := bytes.IndexByte(remaining, lead) + idxNL := bytes.IndexByte(remaining, '\n') + if idxLead < 0 { + idx = idxNL + } else if idxNL < 0 { + idx = idxLead + } else if idxLead < idxNL { + idx = idxLead + } else { + idx = idxNL + } + } + if idx < 0 { + break + } + i += idx + c := b[i] + + if breakNewLines && c == '\n' { if !copied { - // We only allocate an output slice when we know we definitely - // need it. res = make([]byte, 0, len(b)) copied = true } res = append(res, b[k:i]...) - // Either add an end marker, or elide a start marker immediately prior. + + // Close the current redaction section before the newline. + // If the last thing we emitted was a start marker, remove + // it instead of producing an empty ‹› pair. if bytes.HasSuffix(res, start) { res = res[:len(res)-ls] } else { res = append(res, end...) } - // Advance to the last newline character. We want to forward - // them all in a single call to doWrite, for performance. + + // Emit all consecutive newlines as-is, outside any + // redaction envelope. lastNewLine := i for lastNewLine < len(b) && b[lastNewLine] == '\n' { lastNewLine++ } res = append(res, b[i:lastNewLine]...) + + // Reopen the redaction section for content after the + // newline(s). The caller will emit the closing marker. res = append(res, start...) k = lastNewLine - i = lastNewLine - 1 - } else - // Ensure that occurrences of the delimiter inside the string get - // escaped. - // Reminder: ls and le are likely greater than 1, as we are scanning - // utf-8 encoded delimiters (the utf-8 encoding is multibyte). - if i+ls <= len(b) && bytes.Equal(b[i:i+ls], start) { - if !copied { - // We only allocate an output slice when we know we definitely - // need it. - res = make([]byte, 0, len(b)+len(escape)) - copied = true - } - res = append(res, b[k:i]...) - res = append(res, escape...) - // Advance the counters by the length (in bytes) of the delimiter. 
- k = i + ls - i += ls - 1 /* -1 because we have i++ at the end of every iteration */ - } else if i+le <= len(b) && bytes.Equal(b[i:i+le], end) { - if !copied { - // See the comment above about res allocation. - res = make([]byte, 0, len(b)+len(escape)) - copied = true - } - res = append(res, b[k:i]...) - res = append(res, escape...) - // Advance the counters by the length (in bytes) of the delimiter. - k = i + le - i += le - 1 /* -1 because we have i++ at the end of every iteration */ - } else if i+lh <= len(b) && bytes.Equal(b[i:i+lh], hashPrefix) { + i = lastNewLine + continue + } + + // c == lead (0xE2). Check if we have a full marker. + if i+2 >= len(b) || b[i+1] != mid { + i++ + continue + } + + b2 := b[i+2] + markerLen := 0 + if b2 == b2Start { + markerLen = ls + } else if b2 == b2End { + markerLen = le + } else if b2 == b2Hash { + markerLen = lh + } + + if markerLen > 0 { if !copied { res = make([]byte, 0, len(b)+len(escape)) copied = true } res = append(res, b[k:i]...) res = append(res, escape...) - k = i + lh - i += lh - 1 + k = i + markerLen + i += markerLen + } else { + i++ } } // If the string terminates with an invalid utf-8 sequence, we diff --git a/internal/escape/escape_test.go b/internal/escape/escape_test.go index 219debe..48de603 100644 --- a/internal/escape/escape_test.go +++ b/internal/escape/escape_test.go @@ -24,22 +24,95 @@ func TestInternalEscape(t *testing.T) { strip bool expected string }{ + // Empty / nil inputs. {nil, 0, false, false, ""}, {[]byte(""), 0, false, false, ""}, + + // Pure ASCII, no markers. {[]byte("abc"), 0, false, false, "abc"}, + {[]byte("hello world 12345"), 0, false, false, "hello world 12345"}, + + // Start marker escaping. 
{[]byte("‹abc›"), 0, false, false, "?abc?"}, {[]byte("‹abc›"), 3, false, false, "‹abc?"}, {[]byte("‹abc›def›ghi"), 3, false, false, "‹abc?def?ghi"}, {[]byte("‹abc›"), len([]byte("‹abc›")), false, false, "‹abc›"}, {[]byte("‹abc›‹def›"), len([]byte("‹abc›")), false, false, "‹abc›?def?"}, + + // Multiple markers in sequence. + {[]byte("‹‹‹"), 0, false, false, "???"}, + {[]byte("›››"), 0, false, false, "???"}, + {[]byte("‹›‹›"), 0, false, false, "????"}, + + // Markers with surrounding text. + {[]byte("before‹mid›after"), 0, false, false, "before?mid?after"}, + {[]byte("a‹b›c‹d›e"), 0, false, false, "a?b?c?d?e"}, + + // Newline handling (breakNewLines=false, should not break). {[]byte("‹abc›\n‹d\nef›"), len([]byte("‹abc›")), false, false, "‹abc›\n?d\nef?"}, + + // Newline handling (breakNewLines=true). {[]byte("abc\n‹d\nef›\n \n\n "), len([]byte("abc")), true, false, "abc›\n‹?d›\n‹ef?›\n‹ ›\n\n‹ "}, {[]byte("abc\n‹d\nef›\n \n\n "), len([]byte("abc")), true, true, "abc›\n‹?d›\n‹ef?"}, {[]byte("‹abc› ‹def›"), len([]byte("‹abc› ")), true, true, "‹abc› ?def?"}, {[]byte("abc‹\ndef"), len([]byte("abc‹")), true, true, "abc\n‹def"}, + + // Multiple consecutive newlines with breakNewLines. + {[]byte("a\n\n\nb"), 0, true, false, "a›\n\n\n‹b"}, + {[]byte("\nabc"), 0, true, false, "›\n‹abc"}, + + // Hash prefix escaping. {[]byte("†abc"), 0, false, false, "?abc"}, {[]byte("‹†abc›"), 3, false, false, "‹?abc?"}, {[]byte("hello†world"), 0, false, false, "hello?world"}, + {[]byte("†"), 0, false, false, "?"}, + {[]byte("a†b†c"), 0, false, false, "a?b?c"}, + + // All three marker types together. + {[]byte("‹†›"), 0, false, false, "???"}, + + // Truncated lead byte at end of input (0xE2 without enough following bytes). + // 0xE2 alone at end — not a complete marker, so the byte itself passes through unescaped; the trailing ? in the expected output comes from the trailing-invalid-UTF-8 handling (see the RuneError cases below), not from marker escaping. + {[]byte("abc\xe2"), 0, false, false, "abc\xe2?"}, + // 0xE2 0x80 at end — still not a complete marker.
+ {[]byte("abc\xe2\x80"), 0, false, false, "abc\xe2\x80?"}, + + // Lead byte 0xE2 with wrong second byte (not 0x80). + // This is a valid UTF-8 sequence but not a marker. + {[]byte("café"), 0, false, false, "café"}, // é = 0xC3 0xA9, no lead byte + {[]byte("abc\xe2\x82\xac def"), 0, false, false, "abc€ def"}, // € = E2 82 AC, lead matches but mid doesn't + + // Lead byte 0xE2 0x80 followed by non-marker third byte. + // U+2014 EM DASH = E2 80 94, shares lead+mid but third byte doesn't match. + {[]byte("hello\xe2\x80\x94world"), 0, false, false, "hello—world"}, + // U+2026 ELLIPSIS = E2 80 A6. + {[]byte("wait\xe2\x80\xa6"), 0, false, false, "wait…"}, + + // Trailing invalid UTF-8 (RuneError) — single invalid byte at end. + {[]byte("abc\xff"), 0, false, false, "abc\xff?"}, + // Invalid byte at end with no prior escaping needed. + {[]byte("hello\x80"), 0, false, false, "hello\x80?"}, + + // Invalid trailing byte combined with markers. + {[]byte("‹x›\xff"), 0, false, false, "?x?\xff?"}, + + // Strip mode. + {[]byte("abc \n"), 0, false, true, "abc"}, + {[]byte("abc "), 0, false, true, "abc"}, + {[]byte("abc\n\n\n"), 0, false, true, "abc"}, + + // Start offset beyond input length. + {[]byte("abc"), 5, false, false, "abc"}, + + // Start offset at exact end. + {[]byte("abc"), 3, false, false, "abc"}, + + // Back-to-back marker pairs with start offset 0: every marker is escaped. + {[]byte("‹abc›‹def›"), 0, false, false, "?abc??def?"}, + + // breakNewLines with markers and newlines interleaved. + {[]byte("‹a\nb›"), 0, true, false, "?a›\n‹b?"}, + {[]byte("x\n‹y›\nz"), 0, true, false, "x›\n‹?y?›\n‹z"}, } for _, tc := range testCases {