From f13a4e659228fac888d529d4027b72f36917428f Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Fri, 23 Jan 2026 14:33:54 -0600 Subject: [PATCH 1/3] Upgrade ego-tree to 0.11.0 and html5ever to 0.37.1 - Bump ego-tree from 0.10.0 to 0.11.0 - Bump html5ever from 0.36.0 to 0.37.1 - Bump tendril from 0.4.3 to 0.5.0 (required by html5ever 0.37.1) - Add clone_subtree() method to TreeSink implementation --- Cargo.lock | 40 ++++++++++------------------------- scraper/Cargo.toml | 6 +++--- scraper/src/html/tree_sink.rs | 35 +++++++++++++++++++++++++++++- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f9efe932..58ba7a41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,9 +75,9 @@ dependencies = [ [[package]] name = "ego-tree" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" +checksum = "b04dc5a38e4f151a79d9f2451ae6037fb6eaf5cba34771f44781f80e508498e3" [[package]] name = "equivalent" @@ -91,16 +91,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "futf" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" -dependencies = [ - "mac", - "new_debug_unreachable", -] - [[package]] name = "getopts" version = "0.2.24" @@ -118,9 +108,9 @@ checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" [[package]] name = "html5ever" -version = "0.36.1" +version = "0.37.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6452c4751a24e1b99c3260d505eaeee76a050573e61f30ac2c924ddc7236f01e" +checksum = "5935f02fdc02823ff15fec27c2b3d7ca19d629e996f7a0ae4d7d500e62e54c76" dependencies = [ "log", "markup5ever", @@ -163,17 +153,11 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" -[[package]] -name = "mac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" - [[package]] name = "markup5ever" -version = "0.36.1" +version = "0.37.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c3294c4d74d0742910f8c7b466f44dda9eb2d5742c1e430138df290a1e8451c" +checksum = "7cfb33ea12d5d83b1ba9a55ae7d05faec4f2189d47b79c04d4cea6bbe9f5b083" dependencies = [ "log", "tendril", @@ -422,7 +406,6 @@ dependencies = [ "parking_lot", "phf_shared", "precomputed-hash", - "serde", ] [[package]] @@ -450,12 +433,11 @@ dependencies = [ [[package]] name = "tendril" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24" dependencies = [ - "futf", - "mac", + "new_debug_unreachable", "utf-8", ] @@ -479,9 +461,9 @@ checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "web_atoms" -version = "0.2.0" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd0c322f146d0f8aad130ce6c187953889359584497dac6561204c8e17bb43d" +checksum = "c7fa72497c57079de16225d9a886d6c9a80c34f8e5a9cd5c64b71a449cbba195" dependencies = [ "phf", "phf_codegen", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index f1534d10..4a2780ad 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -14,13 +14,13 @@ readme = "README.md" [dependencies] cssparser = "0.36.0" -ego-tree = "0.10.0" -html5ever = "0.36.0" +ego-tree = "0.11.0" +html5ever = "0.37.1" indexmap = { version = "2.13.0", optional = true } precomputed-hash = "0.1.1" selectors = "0.35.0" serde = { version = "1.0.228", optional = true } -tendril = "0.4.3" +tendril = "0.5.0" [dependencies.getopts] version = "0.2.24" diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index 8af46156..4efc3096 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -1,7 +1,7 @@ use super::Html; use crate::node::{Comment, Doctype, Element, Node, ProcessingInstruction, Text}; use crate::tendril_util::make as make_tendril; -use ego_tree::NodeId; +use ego_tree::{NodeId, Tree}; use html5ever::tendril::StrTendril; use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::Attribute; @@ -294,4 +294,37 @@ impl TreeSink for HtmlTreeSink { self.append(prev_element, child) } } + + // Clone the subtree rooted at the given node. + fn clone_subtree(&self, target: &Self::Handle) -> Self::Handle { + let this = self.0.borrow(); + + fn clone_node_recursive( + tree: &Tree, + node_id: NodeId, + new_tree: &mut Tree, + parent_id: Option, + ) -> NodeId { + let node = tree.get(node_id).unwrap(); + let cloned_value = node.value().clone(); + + let new_node_id = if let Some(parent) = parent_id { + let mut parent_node = new_tree.get_mut(parent).unwrap(); + parent_node.append(cloned_value); + parent_node.last_child().unwrap().id() + } else { + new_tree.orphan(cloned_value).id() + }; + + for child in node.children() { + clone_node_recursive(tree, child.id(), new_tree, Some(new_node_id)); + } + + new_node_id + } + + drop(this); + let mut this = self.0.borrow_mut(); + clone_node_recursive(&this.tree.clone(), *target, &mut this.tree, None) + } } From 146ea83c5720b6b31447beedd4ae03c39ee5f61b Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Fri, 23 Jan 2026 14:38:55 -0600 Subject: [PATCH 2/3] clippy fixes --- scraper/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper/src/main.rs b/scraper/src/main.rs index 78016eac..72328670 100644 --- a/scraper/src/main.rs +++ b/scraper/src/main.rs @@ -75,7 +75,7 @@ fn main() { let matches = match opts.parse(&args[1..]) { Ok(m) => m, Err(f) => { - eprintln!("{}", f); + eprintln!("{f}"); process::exit(USAGE); } }; @@ -131,7 +131,7 @@ fn main() { let files = &matches.free[1..]; let selector = Selector::parse(selector).unwrap_or_else(|e| { - eprintln!("failed to parse selector: {}", e); + eprintln!("failed to parse selector: {e}"); process::exit(USAGE); }); From f28f7264ab2ad1b08fc31ea6d867ce486d7c7c0f Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Fri, 23 Jan 2026 14:51:13 -0600 Subject: [PATCH 3/3] Use ego-tree clone_subtree method --- scraper/src/html/tree_sink.rs | 38 +++++++---------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index 4efc3096..8b3d0205 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -1,7 +1,7 @@ use super::Html; use crate::node::{Comment, Doctype, Element, Node, ProcessingInstruction, Text}; use crate::tendril_util::make as make_tendril; -use ego_tree::{NodeId, Tree}; +use ego_tree::NodeId; use html5ever::tendril::StrTendril; use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::Attribute; @@ -295,36 +295,12 @@ impl TreeSink for HtmlTreeSink { } } - // Clone the subtree rooted at the given node. fn clone_subtree(&self, target: &Self::Handle) -> Self::Handle { - let this = self.0.borrow(); - - fn clone_node_recursive( - tree: &Tree, - node_id: NodeId, - new_tree: &mut Tree, - parent_id: Option, - ) -> NodeId { - let node = tree.get(node_id).unwrap(); - let cloned_value = node.value().clone(); - - let new_node_id = if let Some(parent) = parent_id { - let mut parent_node = new_tree.get_mut(parent).unwrap(); - parent_node.append(cloned_value); - parent_node.last_child().unwrap().id() - } else { - new_tree.orphan(cloned_value).id() - }; - - for child in node.children() { - clone_node_recursive(tree, child.id(), new_tree, Some(new_node_id)); - } - - new_node_id - } - - drop(this); - let mut this = self.0.borrow_mut(); - clone_node_recursive(&this.tree.clone(), *target, &mut this.tree, None) + let mut html = self.0.borrow_mut(); + + let mut source_node = html.tree.get_mut(*target).unwrap(); + let cloned_subtree = source_node.clone_subtree(); + + cloned_subtree.id() } }