Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ phf = "0.13"
phf_codegen = "0.13"
string_cache = { version = "0.9.0", default-features = false }
string_cache_codegen = "0.6.1"
utf-8 = "0.7"

# Dev dependencies
criterion = "0.8"
Expand Down
1 change: 0 additions & 1 deletion tendril/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ encoding_rs = ["dep:encoding_rs"]
[dependencies]
encoding_rs = { workspace = true, optional = true}
new_debug_unreachable = { workspace = true }
utf-8 = { workspace = true }

[dev-dependencies]
rand = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions tendril/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ pub mod stream;

mod buf32;
mod tendril;
mod utf8;
mod utf8_decode;
mod util;

Expand Down
2 changes: 1 addition & 1 deletion tendril/src/stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ use std::io;
use std::marker::PhantomData;
use std::path::Path;

use crate::utf8;
#[cfg(feature = "encoding_rs")]
use encoding_rs::{self, DecoderResult};
use utf8;

/// Trait for types that can process a tendril.
///
Expand Down
141 changes: 141 additions & 0 deletions tendril/src/utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::cmp;
use std::str;

/// The replacement character, U+FFFD, as a `&str`. In lossy decoding, insert it
/// for every decoding error (this is the string form of `char::REPLACEMENT_CHARACTER`).
pub(crate) const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";

/// Failure modes of [`decode`]: the chunk either contains a malformed byte
/// sequence or ends in the middle of one. All borrowed fields point back
/// into the original `input` slice — no copying happens on the error path.
#[derive(Debug, Copy, Clone)]
pub(crate) enum DecodeError<'a> {
    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
    /// then call `decode()` again with `remaining_input`.
    Invalid {
        /// Longest valid UTF-8 prefix preceding the error.
        valid_prefix: &'a str,
        /// The offending byte sequence (length given by `Utf8Error::error_len`).
        invalid_sequence: &'a [u8],
        /// Bytes after the invalid sequence, not yet examined.
        // NOTE(review): unused by this crate's current callers; appears to be
        // kept for parity with the upstream `utf8` crate this module is
        // vendored from — confirm before removing.
        #[allow(unused)]
        remaining_input: &'a [u8],
    },

    /// Call the `incomplete_suffix.try_complete` method with more input when available.
    /// If no more input is available, this is an invalid byte sequence.
    Incomplete {
        /// Longest valid UTF-8 prefix preceding the truncated sequence.
        valid_prefix: &'a str,
        /// Buffered trailing bytes that may yet form a valid sequence.
        incomplete_suffix: Incomplete,
    },
}

/// Holds the trailing bytes of a potentially-truncated UTF-8 sequence,
/// carried across chunk boundaries until enough input arrives to classify
/// it as either a complete character or an invalid sequence.
#[derive(Debug, Copy, Clone)]
pub(crate) struct Incomplete {
    /// Raw bytes of the pending sequence; only `buffer[..buffer_len]` is meaningful.
    pub(crate) buffer: [u8; 4],
    /// Number of occupied bytes in `buffer` (structurally at most 4).
    pub(crate) buffer_len: u8,
}

/// Decode `input` as UTF-8, borrowing from the input on success.
///
/// On failure, reports the longest valid prefix plus either the invalid
/// byte sequence ([`DecodeError::Invalid`]) or a buffered incomplete
/// suffix awaiting more input ([`DecodeError::Incomplete`]).
pub(crate) fn decode(input: &[u8]) -> Result<&str, DecodeError<'_>> {
    match str::from_utf8(input) {
        Ok(whole) => Ok(whole),
        Err(utf8_error) => {
            let valid_len = utf8_error.valid_up_to();
            let (valid_bytes, after_valid) = input.split_at(valid_len);
            // SAFETY: `Utf8Error::valid_up_to()` guarantees that
            // `input[..valid_len]` is valid UTF-8.
            let valid_prefix = unsafe { str::from_utf8_unchecked(valid_bytes) };
            Err(match utf8_error.error_len() {
                // `None` means the input ended mid-sequence: buffer the
                // trailing bytes so decoding can resume with the next chunk.
                None => DecodeError::Incomplete {
                    valid_prefix,
                    incomplete_suffix: Incomplete::new(after_valid),
                },
                // `Some(n)` means the next `n` bytes can never start a
                // valid sequence, regardless of further input.
                Some(bad_len) => {
                    let (invalid_sequence, remaining_input) = after_valid.split_at(bad_len);
                    DecodeError::Invalid {
                        valid_prefix,
                        invalid_sequence,
                        remaining_input,
                    }
                },
            })
        },
    }
}

impl Incomplete {
    /// Buffer the given bytes as the start of a possibly-truncated sequence.
    ///
    /// Panics if `bytes.len() > 4` (the slice of `buffer` would be out of
    /// bounds); callers pass the incomplete suffix reported by
    /// `str::from_utf8`, which is presumably at most 3 bytes — confirm.
    fn new(bytes: &[u8]) -> Self {
        let mut buffer = [0, 0, 0, 0];
        let len = bytes.len();
        buffer[..len].copy_from_slice(bytes);
        Incomplete {
            buffer,
            buffer_len: len as u8,
        }
    }

    /// * `None`: still incomplete, call `try_complete` again with more input.
    ///   If no more input is available, this is an invalid byte sequence.
    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
    ///   To keep decoding, pass `remaining_input` to `decode()`.
    ///   `Ok(str)` is the completed character(s); `Err(bytes)` is an invalid
    ///   sequence to be replaced (e.g. with U+FFFD) in lossy decoding.
    #[allow(clippy::type_complexity)]
    pub(crate) fn try_complete<'input>(
        &mut self,
        input: &'input [u8],
    ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
        let (consumed, opt_result) = self.try_complete_offsets(input);
        // Still incomplete: bytes were absorbed into the buffer, nothing to emit yet.
        let result = opt_result?;
        let remaining_input = &input[consumed..];
        // Take the buffered bytes and reset the buffer for the next sequence.
        let result_bytes = self.take_buffer();
        let result = match result {
            // SAFETY: `try_complete_offsets` only returns `Ok(())` after
            // `str::from_utf8` validated (a prefix of) the spliced buffer and
            // set `buffer_len` to that validated length, so `result_bytes`
            // is valid UTF-8.
            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
            Err(()) => Err(result_bytes),
        };
        Some((result, remaining_input))
    }

    /// Return the occupied portion of the buffer and mark it empty.
    /// The returned slice stays valid because only `buffer_len` is reset.
    fn take_buffer(&mut self) -> &[u8] {
        let len = self.buffer_len as usize;
        self.buffer_len = 0;
        &self.buffer[..len]
    }

    /// (consumed_from_input, None): not enough input
    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
        let initial_buffer_len = self.buffer_len as usize;
        let copied_from_input;
        {
            // Splice as many input bytes as fit after the buffered prefix,
            // then validate the combined bytes as one candidate sequence.
            let unwritten = &mut self.buffer[initial_buffer_len..];
            copied_from_input = cmp::min(unwritten.len(), input.len());
            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
        }
        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
        match str::from_utf8(spliced) {
            Ok(_) => {
                // Whole splice is valid: record its full length so
                // `take_buffer` hands back everything, including the
                // bytes just copied from `input`.
                self.buffer_len = spliced.len() as u8;
                (copied_from_input, Some(Ok(())))
            },
            Err(error) => {
                let valid_up_to = error.valid_up_to();
                if valid_up_to > 0 {
                    // A valid sequence was completed, followed by more bytes.
                    // The unwrap asserts the invariant that the previously
                    // buffered bytes alone were never valid, so any valid
                    // prefix must extend past them (valid_up_to >= initial).
                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
                    self.buffer_len = valid_up_to as u8;
                    (consumed, Some(Ok(())))
                } else {
                    match error.error_len() {
                        Some(invalid_sequence_length) => {
                            // Definitely invalid. As above, the unwrap asserts
                            // the error span covers at least the buffered bytes.
                            let consumed = invalid_sequence_length
                                .checked_sub(initial_buffer_len)
                                .unwrap();
                            self.buffer_len = invalid_sequence_length as u8;
                            (consumed, Some(Err(())))
                        },
                        None => {
                            // Still incomplete: keep everything buffered and
                            // wait for the next call with more input.
                            self.buffer_len = spliced.len() as u8;
                            (copied_from_input, None)
                        },
                    }
                }
            },
        }
    }
}
1 change: 1 addition & 0 deletions tendril/src/utf8_decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// except according to those terms.

use crate::fmt;
use crate::utf8;
use crate::{Atomicity, Tendril};

pub struct IncompleteUtf8(utf8::Incomplete);
Expand Down
Loading